diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index 0e46bd9dfd..0000000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1,21 +0,0 @@ -# Remember that the last applicable rule for any given file is the only one -# that applies. - -# Default rule: @regisss gets everything. -* @regisss - -/optimum/habana/transformers/models/albert @skaulintel @ANSHUMAN87 -/optimum/habana/transformers/models/bloom @dvarshney-habana -/optimum/habana/transformers/models/esm @bzhu-habana -/optimum/habana/transformers/models/falcon @libinta @mandy-li @dvarshney-habana -/optimum/habana/transformers/models/gpt2 @ZhaiFeiyue -/optimum/habana/transformers/models/gpt_bigcode @ZhaiFeiyue -/optimum/habana/transformers/models/gpt_neox @ZhaiFeiyue @mandy-li -/optimum/habana/transformers/models/gptj @ZhaiFeiyue -/optimum/habana/transformers/models/llama @mandy-li @libinta @dvarshney-habana -/optimum/habana/transformers/models/mpt @mandy-li -/optimum/habana/transformers/models/opt @ZhaiFeiyue -/optimum/habana/transformers/models/t5 @bhargaveede -/optimum/habana/transformers/models/vit @ZhaiFeiyue @jychen-habana -/optimum/habana/transformers/models/wav2vec2 @hlahkar @vivekgoe -/optimum/habana/transformers/generation/ @ssarkar2 @bhargaveede @vivekgoe diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml deleted file mode 100644 index b9f1aaa44a..0000000000 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: "\U0001F41B Bug Report" -description: Submit a bug report to help us improve Optimum Habana -labels: [ "bug" ] -body: - - type: textarea - id: system-info - attributes: - label: System Info - description: Please share your system info with us. - render: shell - placeholder: Optimum Habana version, SynapseAI version, Docker image... - validations: - required: true - - - type: checkboxes - id: information-scripts-examples - attributes: - label: Information - description: 'The problem arises when using:' - options: - - label: "The official example scripts" - - label: "My own modified scripts" - - - type: checkboxes - id: information-tasks - attributes: - label: Tasks - description: "The tasks I am working on are:" - options: - - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" - - label: "My own task or dataset (give details below)" - - - type: textarea - id: reproduction - validations: - required: true - attributes: - label: Reproduction - description: | - Please provide a code sample that reproduces the problem you ran into. - If you have error messages or stack traces please provide them here as well. - Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting - Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. - - placeholder: | - Steps to reproduce the behavior: - - 1. - 2. - 3. - - - - type: textarea - id: expected-behavior - validations: - required: true - attributes: - label: Expected behavior - description: "A clear and concise description of what you would expect to happen." 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index f5a53d409d..0000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,9 +0,0 @@ -blank_issues_enabled: true -version: 2.1 -contact_links: - - name: Website Related - url: https://github.com/huggingface/hub-docs/issues - about: Feature requests and bug reports related to the website - - name: Forum - url: https://discuss.huggingface.co/c/optimum/ - about: General usage questions and community discussions diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml deleted file mode 100644 index 69a7245e34..0000000000 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: "\U0001F680 Feature request" -description: Submit a proposal/request for a new Optimum Habana feature -labels: [ "feature" ] -body: - - type: textarea - id: feature-request - validations: - required: true - attributes: - label: Feature request - description: | - A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. - - - type: textarea - id: motivation - validations: - required: true - attributes: - label: Motivation - description: | - Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. - - - - type: textarea - id: contribution - validations: - required: true - attributes: - label: Your contribution - description: | - Is there any way that you could help, e.g. by submitting a PR? diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 0fe2019442..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,21 +0,0 @@ -# What does this PR do? - - - - - -Fixes # (issue) - - -## Before submitting -- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). -- [ ] Did you make sure to update the documentation with your changes? -- [ ] Did you write any new necessary tests? diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml deleted file mode 100644 index 9d6b2d4005..0000000000 --- a/.github/workflows/build_pr_documentation.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build PR documentation - -on: - pull_request: - branches: [ main ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build_documentation: - runs-on: ubuntu-20.04 - env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha }} - PR_NUMBER: ${{ github.event.number }} - EVENT_CONTEXT: ${{ toJSON(github.event) }} - PR_CLONE_URL: ${{ github.event.pull_request.head.repo.clone_url }} - - steps: - - uses: actions/checkout@v2 - with: - repository: 'huggingface/doc-builder' - path: doc-builder - - - uses: actions/checkout@v2 - with: - repository: 'huggingface/optimum-habana' - path: optimum-habana - - - name: Setup environment - run: | - pip uninstall -y doc-builder - cd doc-builder - git pull origin main - pip install . - cd .. - - - name: Make documentation - run: | - cd optimum-habana - make doc BUILD_DIR=habana-doc-build VERSION=pr_$PR_NUMBER COMMIT_SHA_SUBPACKAGE=$COMMIT_SHA CLONE_URL=$PR_CLONE_URL - cd .. 
- - - name: Save commit_sha & pr_number - run: | - cd optimum-habana - sudo chmod -R ugo+rwx habana-doc-build - cd habana-doc-build - sudo mv optimum.habana optimum-habana - echo ${{ env.COMMIT_SHA }} > ./commit_sha - echo ${{ env.PR_NUMBER }} > ./pr_number - - - uses: actions/upload-artifact@v3 - with: - name: doc-build-artifact - path: optimum-habana/habana-doc-build/ diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml deleted file mode 100644 index b734c564d3..0000000000 --- a/.github/workflows/check_code_quality.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Check code quality - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - check: - strategy: - fail-fast: false - matrix: - python-version: ['3.10'] - os: [ubuntu-22.04] - name: Check code quality - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Create and start a virtual environment - run: | - python -m venv venv - source venv/bin/activate - - name: Install dependencies - run: | - source venv/bin/activate - pip install --upgrade pip - pip install ruff - - name: Check style with ruff - run: | - source venv/bin/activate - ruff check . setup.py - ruff format --check . setup.py diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml deleted file mode 100644 index 24af938f01..0000000000 --- a/.github/workflows/fast_tests.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Unit and integration tests - - -on: - workflow_dispatch: - pull_request: - branches: [ main ] - push: - branches: [ main ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - transformers: - name: Run tests for optimum.habana.transformers - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/fast_tests.sh - diffusers: - name: Run tests for optimum.habana.diffusers - needs: - - transformers # required to wait for the previous tests to finish - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/fast_tests_diffusers.sh diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml deleted file mode 100644 index 82914019e1..0000000000 --- 
a/.github/workflows/slow_tests.yml +++ /dev/null @@ -1,270 +0,0 @@ -name: Non-regression tests - -on: - workflow_dispatch: - schedule: - - cron: '0 21 * * 0-5' # every Sunday to Friday at 11pm CET (10pm winter time) - - cron: '0 21 * * 6' # every Saturday at 1am CET (midnight winter time) - -concurrency: - group: ${{ github.workflow }} - -jobs: - example-diff: - name: Test examples differences - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/example_diff_tests.sh - stable-diffusion: - name: Test Stable Diffusion - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_diffusers.sh - deepspeed: - name: Test DeepSpeed models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - stable-diffusion # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_deepspeed.sh - multi-card: - name: Test multi-card models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host 
\ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_8x.sh - single-card: - name: Test single-card models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed - - multi-card # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_1x.sh - albert-xxl-single-card: - name: Test single-card ALBERT XXL - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed - - multi-card - - single-card # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - if: github.event.schedule == '0 21 * * 6' - uses: actions/checkout@v2 - - name: Pull image - if: github.event.schedule == '0 21 * * 6' - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run test - if: github.event.schedule == '0 21 * * 6' - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/albert_xxl_1x.sh - - name: Warning - if: github.event.schedule != '0 21 * * 6' - run: echo "ALBERT XXL 1x is only tested on Saturdays." 
- text-generation: - name: Test text-generation example - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed - - multi-card - - single-card - - albert-xxl-single-card # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} - trl: - name: Test TRL integration - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed - - multi-card - - single-card - - albert-xxl-single-card - - text-generation # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_trl.sh - sentence-transformers: - name: Test Sentence Transformers integration - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - example-diff - - deepspeed - - multi-card - - single-card - - albert-xxl-single-card - - text-generation - - trl # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi-habana] # run the job on the newly created runner - steps: - - name: Checkout Optimum Habana - uses: actions/checkout@v2 - with: - repository: 'huggingface/optimum-habana' - path: optimum-habana - - name: Checkout Sentence Transformers - uses: actions/checkout@v2 - with: - repository: 'UKPLab/sentence-transformers' - path: sentence-transformers - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml deleted file mode 100644 index 623b62f325..0000000000 --- a/.github/workflows/slow_tests_gaudi2.yml +++ /dev/null @@ -1,228 +0,0 @@ -name: (Gaudi2) Non-regression tests - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * 3,6' # every Wednesday and Saturday at 1am CET (midnight 
winter time) - -concurrency: - group: ${{ github.workflow }} - -jobs: - stable-diffusion: - name: Test Stable Diffusion - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_diffusers.sh - deepspeed: - name: Test DeepSpeed models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - stable-diffusion # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_deepspeed.sh - fsdp: - name: Test FSDP models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - deepspeed # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} - multi-card: - name: Test multi-card models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - fsdp # run the job when the previous test job is done - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_8x.sh - single-card: - name: Test single-card models - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - deepspeed - - multi-card # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: 
actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - -e RUN_ALBERT_XXL_1X=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_1x.sh - text-generation: - name: Test text-generation example - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - deepspeed - - multi-card - - single-card # run the job when the previous test jobs are done - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} - trl: - name: Test TRL integration - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - text-generation - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e GAUDI2_CI=1 \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash tests/ci/slow_tests_trl.sh - sentence-transformers: - name: Test Sentence Transformers integration - if: ${{ !cancelled() && (success() || failure()) }} - needs: - - trl - runs-on: [self-hosted, linux, x64, gaudi2] - steps: - - name: Checkout Optimum Habana - uses: actions/checkout@v2 - with: - repository: 'huggingface/optimum-habana' - path: optimum-habana - - name: Checkout Sentence Transformers - uses: actions/checkout@v2 - with: - repository: 'UKPLab/sentence-transformers' - path: sentence-transformers - - name: Pull image - run: | - docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - - name: Run tests - run: | - docker run \ - -v $PWD:/root/workspace \ - --workdir=/root/workspace \ - --runtime=habana \ - -e HABANA_VISIBLE_DEVICES=all \ - -e GAUDI2_CI=1 \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - --cap-add=sys_nice \ - --net=host \ - --ipc=host \ - vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest \ - /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml deleted file mode 100644 index 9cbbf68037..0000000000 --- a/.github/workflows/trufflehog.yml +++ /dev/null @@ -1,15 +0,0 @@ -on:
- push: - -name: Secret Leaks - -jobs: - trufflehog: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Secret Scanning - uses: trufflesecurity/trufflehog@main diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml deleted file mode 100644 index 3c27ba66ea..0000000000 --- a/.github/workflows/upload_pr_documentation.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Upload PR Documentation - -on: - workflow_run: - workflows: ["Build PR Documentation"] - types: - - completed - -jobs: - build: - uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main - with: - package_name: optimum-habana - secrets: - hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} - comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} diff --git a/.gitignore b/.gitignore deleted file mode 100644 index fcac20e876..0000000000 --- a/.gitignore +++ /dev/null @@ -1,135 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# SynapseAI logs -.local.synapse_log* - -# ruff -.ruff_cache diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9e9f..0000000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index d786fdfc6d..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include README.md -include LICENSE diff --git a/Makefile b/Makefile deleted file mode 100644 index 6e87a399a3..0000000000 --- a/Makefile +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -SHELL := /bin/bash -CURRENT_DIR = $(shell pwd) -DEFAULT_CLONE_URL := https://github.com/huggingface/optimum-habana.git -# If CLONE_URL is empty, revert to DEFAULT_CLONE_URL -REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) - - -.PHONY: style test - -# Run code quality checks -style_check: clean - pip install -U pip ruff - ruff check . setup.py - ruff format --check . setup.py - -style: clean - pip install -U pip ruff - ruff check . setup.py --fix - ruff format . 
setup.py - -# Run unit and integration tests -fast_tests: - python -m pip install .[tests] - python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py - -# Run unit and integration tests related to Diffusers -fast_tests_diffusers: - python -m pip install .[tests] - python -m pytest tests/test_diffusers.py - -# Run single-card non-regression tests -slow_tests_1x: test_installs - python -m pytest tests/test_examples.py -v -s -k "single_card" - python -m pytest tests/test_pipeline.py - -# Run multi-card non-regression tests -slow_tests_8x: test_installs - python -m pytest tests/test_examples.py -v -s -k "multi_card" - -# Run DeepSpeed non-regression tests -slow_tests_deepspeed: test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 - python -m pytest tests/test_examples.py -v -s -k "deepspeed" - -slow_tests_diffusers: test_installs - python -m pytest tests/test_diffusers.py -v -s -k "test_no_" - python -m pytest tests/test_diffusers.py -v -s -k "test_textual_inversion" - python -m pip install peft==0.7.0 - python -m pytest tests/test_diffusers.py -v -s -k "test_train_text_to_image_" - python -m pytest tests/test_diffusers.py -v -s -k "test_train_controlnet" - -# Run text-generation non-regression tests -slow_tests_text_generation_example: test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 - python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN) - -# Run image-to-text non-regression tests -slow_tests_image_to_text_example: test_installs - python -m pytest tests/test_image_to_text_example.py -v -s --token $(TOKEN) - -slow_tests_fsdp: test_installs - python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN) - -slow_tests_trl: test_installs - python -m pip install trl==0.8.6 - python -m pip install peft==0.7.0 - python -m pytest tests/test_trl.py -v -s -k "test_calculate_loss" - -slow_tests_object_segmentation: test_installs - python -m pytest tests/test_object_segmentation.py - -# Check if examples are up to date with the Transformers library -example_diff_tests: test_installs - python -m pytest tests/test_examples_match_transformers.py - -# Utilities to release to PyPi -build_dist_install_tools: - python -m pip install build - python -m pip install twine - -build_dist: - rm -fr build - rm -fr dist - python -m build - -pypi_upload: build_dist - python -m twine upload dist/* - -build_doc_docker_image: - docker build -t doc_maker --build-arg commit_sha=$(COMMIT_SHA_SUBPACKAGE) --build-arg clone_url=$(REAL_CLONE_URL) ./docs - -doc: build_doc_docker_image - @test -n "$(BUILD_DIR)" || (echo "BUILD_DIR is empty." ; exit 1) - @test -n "$(VERSION)" || (echo "VERSION is empty." ; exit 1) - docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder doc_maker \ - doc-builder build optimum.habana /optimum-habana/docs/source/ \ - --repo_name optimum-habana \ - --build_dir $(BUILD_DIR) \ - --version $(VERSION) \ - --version_tag_suffix "" \ - --html \ - --clean - -clean: - find . -name "habana_log.livealloc.log_*" -type f -delete - find . -name "hl-smi_log*" -type f -delete - find . -name .lock -type f -delete - find . -name .graph_dumps -type d -exec rm -r {} + - find . -name save-hpu.pdb -type f -delete - find . 
-name checkpoints.json -type f -delete - rm -rf regression/ - rm -rf tmp_trainer/ - rm -rf test/ - rm -rf build/ - rm -rf dist/ - rm -rf optimum_habana.egg-info/ - rm -rf hpu_profile/ - -test_installs: - python -m pip install .[tests] diff --git a/README.md b/README.md index fabff9e260..6f0b4d07c2 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. --> +# :no_entry::no_entry::no_entry::no_entry::no_entry: +# This repository is no longer used for development. Its sole purpose is to create PRs against the upstream repository. Please use habana-internal/optimum-habana-fork +# :no_entry::no_entry::no_entry::no_entry::no_entry: + @@ -22,7 +26,6 @@ limitations under the License. - # Optimum for Intel® Gaudi® Accelerators Optimum for Intel Gaudi - a.k.a. `optimum-habana` - is the interface between the Transformers and Diffusers libraries and [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html). @@ -249,4 +252,4 @@ After training your model, feel free to submit it to the Intel [leaderboard](htt ## Development -Check the [contributor guide](https://github.com/huggingface/optimum/blob/main/CONTRIBUTING.md) for instructions. \ No newline at end of file +Check the [contributor guide](https://github.com/huggingface/optimum/blob/main/CONTRIBUTING.md) for instructions. diff --git a/conftest.py b/conftest.py deleted file mode 100644 index 71cb6bb7ca..0000000000 --- a/conftest.py +++ /dev/null @@ -1,25 +0,0 @@ -class Secret: - """ - Taken from: https://stackoverflow.com/a/67393351 - """ - - def __init__(self, value): - self.value = value - - def __repr__(self): - return "Secret(********)" - - def __str___(self): - return "*******" - - -def pytest_addoption(parser): - parser.addoption("--token", action="store", default=None) - - -def pytest_generate_tests(metafunc): - # This is called for every test. Only get/set command line arguments - # if the argument is specified in the list of test "fixturenames". - option_value = Secret(metafunc.config.option.token) - if "token" in metafunc.fixturenames: - metafunc.parametrize("token", [option_value]) diff --git a/docs/Dockerfile b/docs/Dockerfile deleted file mode 100644 index a31904c957..0000000000 --- a/docs/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - -ARG commit_sha -ARG clone_url - -# Need node to build doc HTML.
Taken from https://stackoverflow.com/a/67491580 -RUN apt-get update && apt-get install -y \ - software-properties-common \ - npm -RUN npm install n -g && \ - n latest - -RUN git clone $clone_url optimum-habana && cd optimum-habana && git checkout $commit_sha -RUN python3 -m pip install --no-cache-dir --upgrade pip -RUN python3 -m pip install --no-cache-dir ./optimum-habana[quality] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml deleted file mode 100644 index aa79f0df2e..0000000000 --- a/docs/source/_toctree.yml +++ /dev/null @@ -1,51 +0,0 @@ -- sections: - - local: index - title: 🤗 Optimum Habana - - local: installation - title: Installation - - local: quickstart - title: Quickstart - - sections: - - local: tutorials/overview - title: Overview - - local: tutorials/single_hpu - title: Single-HPU Training - - local: tutorials/distributed - title: Distributed Training - - local: tutorials/inference - title: Run Inference - - local: tutorials/stable_diffusion - title: Stable Diffusion - - local: tutorials/stable_diffusion_ldm3d - title: LDM3D - title: Tutorials - - sections: - - local: usage_guides/overview - title: Overview - - local: usage_guides/pretraining - title: Pretraining Transformers - - local: usage_guides/accelerate_training - title: Accelerating Training - - local: usage_guides/accelerate_inference - title: Accelerating Inference - - local: usage_guides/deepspeed - title: How to use DeepSpeed - - local: usage_guides/multi_node_training - title: Multi-node Training - title: How-To Guides - - sections: - - local: concept_guides/hpu - title: What are Habana's Gaudi and HPUs? - title: Conceptual Guides - - sections: - - local: package_reference/trainer - title: Gaudi Trainer - - local: package_reference/gaudi_config - title: Gaudi Configuration - - local: package_reference/stable_diffusion_pipeline - title: Gaudi Stable Diffusion Pipeline - - local: package_reference/distributed_runner - title: Distributed Runner - title: Reference - title: Optimum Habana - isExpanded: false diff --git a/docs/source/concept_guides/hpu.mdx b/docs/source/concept_guides/hpu.mdx deleted file mode 100644 index 111f8be903..0000000000 --- a/docs/source/concept_guides/hpu.mdx +++ /dev/null @@ -1,49 +0,0 @@ - - -# What are Intel® Gaudi® 1, Intel® Gaudi® 2 and HPUs? - -[Intel Gaudi 1](https://habana.ai/training/gaudi/) and [Intel Gaudi 2](https://habana.ai/training/gaudi2/) are the first- and second-generation AI hardware accelerators designed by Habana Labs and Intel. -A single server contains 8 devices called Habana Processing Units (HPUs) with 96GB of memory each on Gaudi2 and 32GB on first-gen Gaudi. -Check out [here](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) for more information about the underlying hardware architecture. - -The Habana SDK is called [SynapseAI](https://docs.habana.ai/en/latest/index.html) and is common to both first-gen Gaudi and Gaudi2. -As a consequence, 🤗 Optimum Habana is fully compatible with both generations of accelerators. - - -## Execution modes - -Two execution modes are supported on HPUs for PyTorch, which is the main deep learning framework the 🤗 Transformers and 🤗 Diffusers libraries rely on: - -- *Eager mode* execution, where the framework executes one operation at a time as defined in [standard PyTorch eager mode](https://pytorch.org/tutorials/beginner/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.html). -- *Lazy mode* execution, where operations are internally accumulated in a graph. 
The execution of the operations in the accumulated graph is triggered in a lazy manner, only when a tensor value is required by the user or when it is explicitly required in the script. The [SynapseAI graph compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/SynapseAI_Software_Suite.html#graph-compiler-and-runtime) will optimize the execution of the operations accumulated in the graph (e.g. operator fusion, data layout management, parallelization, pipelining and memory management, graph-level optimizations). - -See [here](../usage_guides/accelerate_training#lazy-mode) how to use these execution modes in Optimum for Intel Gaudi. - - -## Distributed training - -First-gen Gaudi and Gaudi2 are well-equipped for distributed training: - -- *Scale-up* to 8 devices on one server. See [here](../tutorials/distributed) how to perform distributed training on a single node. -- *Scale-out* to 1000s of devices on several servers. See [here](../usage_guides/multi_node_training) how to do multi-node training. - - -## Inference - -HPUs can also be used to perform inference: -- Through HPU graphs that are well-suited for latency-sensitive applications. Check out [here](../usage_guides/accelerate_inference) how to use them. -- In lazy mode, which can be used the same way as for training. diff --git a/docs/source/index.mdx b/docs/source/index.mdx deleted file mode 100644 index b33cfd062e..0000000000 --- a/docs/source/index.mdx +++ /dev/null @@ -1,119 +0,0 @@ - - - -# Optimum for Intel Gaudi - -Optimum for Intel Gaudi is the interface between the Transformers and Diffusers libraries and [Intel® Gaudi® AI Accelerators (HPUs)](https://docs.habana.ai/en/latest/index.html). -It provides a set of tools that enable easy model loading, training and inference on single- and multi-HPU settings for various downstream tasks as shown in the table below. - -HPUs offer fast model training and inference as well as a great price-performance ratio. -Check out [this blog post about BERT pre-training](https://huggingface.co/blog/pretraining-bert) and [this post benchmarking Intel Gaudi 2 with NVIDIA A100 GPUs](https://huggingface.co/blog/habana-gaudi-2-benchmark) for concrete examples. -If you are not familiar with HPUs, we recommend you take a look at [our conceptual guide](./concept_guides/hpu). - - -The following model architectures, tasks and device distributions have been validated for Optimum for Intel Gaudi: - - - -In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated. - - - -- Transformers - -| Architecture | Training | Inference | Tasks | -|--------------|:--------:|:---------:|:------| -| BERT | ✅ | ✅ |
<li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li> |
-| RoBERTa | ✅ | ✅ | <li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li> |
-| ALBERT | ✅ | ✅ | <li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li> |
-| DistilBERT | ✅ | ✅ | <li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li> |
-| GPT2 | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| BLOOM(Z) | | <li>DeepSpeed</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| StarCoder / StarCoder2 | ✅ | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| GPT-J | <li>DeepSpeed</li> | <li>Single card</li><li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| GPT-NeoX | <li>DeepSpeed</li> | <li>DeepSpeed</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| OPT | | <li>DeepSpeed</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Llama 2 / CodeLlama / Llama 3 / Llama Guard | ✅ | ✅ | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)</li><li>[text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)</li> |
-| StableLM | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Falcon | <li>LoRA</li> | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| CodeGen | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| MPT | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Mistral | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Phi | ✅ | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Mixtral | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Gemma | ✅ | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Qwen2 | <li>Single card</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| Persimmon | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
-| T5 / Flan T5 | ✅ | ✅ | <li>[summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)</li><li>[translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)</li> |
-| BART | | <li>Single card</li> | <li>[summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)</li><li>[translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)</li><li>[question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)</li> |
-| ViT | ✅ | ✅ | <li>[image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)</li> |
-| Swin | ✅ | ✅ | <li>[image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)</li> |
-| Wav2Vec2 | ✅ | ✅ | <li>[audio classification](https://github.com/huggingface/optimum-habana/tree/main/examples/audio-classification)</li><li>[speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)</li> |
-| Whisper | ✅ | ✅ | <li>[speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)</li> |
-| SpeechT5 | | <li>Single card</li> | <li>[text to speech](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-speech)</li> |
-| CLIP | ✅ | ✅ | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
-| BridgeTower | ✅ | ✅ | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
-| ESMFold | | <li>Single card</li> | <li>[protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)</li> |
-| Blip | | <li>Single card</li> | <li>[visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)</li><li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-| OWLViT | | <li>Single card</li> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
-| ClipSeg | | <li>Single card</li> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
-| Llava / Llava-next | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
-
-
-- Diffusers
-
-| Architecture | Training | Inference | Tasks |
-|---------------------|:--------:|:---------:|:------|
-| Stable Diffusion | <li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
-| Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
-| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
-
-
-- TRL:
-
-| Architecture | Training | Inference | Tasks |
-|------------------|:--------:|:--------------------:|:------|
-| Llama 2 | ✅ | | <li>[DPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)</li> |
-| Llama 2 | ✅ | | <li>[PPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)</li> |
-| Stable Diffusion | ✅ | | <li>[DDPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)</li> |
-
-
-Other models and tasks supported by the 🤗 Transformers and 🤗 Diffusers library may also work.
-You can refer to this [section](https://github.com/huggingface/optimum-habana#how-to-use-it) for using them with 🤗 Optimum Habana.
-Besides, [this page](https://github.com/huggingface/optimum-habana/tree/main/examples) explains how to modify any [example](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the 🤗 Transformers library to make it work with 🤗 Optimum Habana.
-
-
-
-**Tutorials**: Learn the basics and become familiar with training transformers on HPUs with 🤗 Optimum. Start here if you are using 🤗 Optimum Habana for the first time!
-
-**How-to guides**: Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Optimum Habana to solve real-world problems.
-
-**Conceptual guides**: High-level explanations for building a better understanding of important topics such as HPUs.
-
-**Reference**: Technical descriptions of how the Habana classes and methods of 🤗 Optimum Habana work.
-
    \ No newline at end of file diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx deleted file mode 100644 index 3d657260f0..0000000000 --- a/docs/source/installation.mdx +++ /dev/null @@ -1,28 +0,0 @@ - - -# Installation - -To install Optimum for Intel Gaudi, you first need to install SynapseAI and the Intel® Gaudi® drivers by following the official [installation guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). -Then, Optimum for Intel Gaudi can be installed using `pip` as follows: - -```bash -python -m pip install --upgrade-strategy eager optimum[habana] -``` - - -To use DeepSpeed on HPUs, you also need to run the following command: - -```bash -python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -``` - diff --git a/docs/source/package_reference/distributed_runner.mdx b/docs/source/package_reference/distributed_runner.mdx deleted file mode 100644 index 388da28c1a..0000000000 --- a/docs/source/package_reference/distributed_runner.mdx +++ /dev/null @@ -1,20 +0,0 @@ - - -# DistributedRunner - -[[autodoc]] distributed.distributed_runner.DistributedRunner - - all \ No newline at end of file diff --git a/docs/source/package_reference/gaudi_config.mdx b/docs/source/package_reference/gaudi_config.mdx deleted file mode 100644 index 1060e9c64e..0000000000 --- a/docs/source/package_reference/gaudi_config.mdx +++ /dev/null @@ -1,53 +0,0 @@ - - -# GaudiConfig - -Here is a description of each configuration parameter: -- `use_fused_adam` enables to decide whether to use the [custom fused implementation of the ADAM optimizer provided by Intel® Gaudi® AI Accelerator](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#custom-optimizers). -- `use_fused_clip_norm` enables to decide whether to use the [custom fused implementation of gradient norm clipping provided by Intel® Gaudi® AI Accelerator](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#other-custom-ops). -- `use_torch_autocast` enables PyTorch autocast; used to define good pre-defined config; users should favor `--bf16` training argument -- `autocast_bf16_ops` list of operations that should be run with bf16 precision under autocast context; using environment flag LOWER_LIST is a preffered way for operator autocast list override -- `autocast_fp32_ops` list of operations that should be run with fp32 precision under autocast context; using environment flag FP32_LIST is a preffered way for operator autocast list override - - -You can find examples of Gaudi configurations in the [Habana model repository on the Hugging Face Hub](https://huggingface.co/habana). For instance, [for BERT Large we have](https://huggingface.co/Habana/bert-large-uncased-whole-word-masking/blob/main/gaudi_config.json): - -```JSON -{ - "use_fused_adam": true, - "use_fused_clip_norm": true, -} -``` - -To instantiate yourself a Gaudi configuration in your script, you can do the following -```python -from optimum.habana import GaudiConfig - -gaudi_config = GaudiConfig.from_pretrained( - gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, -) -``` -and pass it to the trainer with the `gaudi_config` argument. 
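For illustration, here is a minimal sketch of that last step. It assumes a 🤗 Transformers `model`, a `GaudiTrainingArguments` instance named `training_args`, and a `train_dataset` have already been created elsewhere; the Hub repository name is the BERT Large configuration shown above:

```python
from optimum.habana import GaudiConfig, GaudiTrainer

# Load a Gaudi configuration from the Hub (or from a local JSON file)
gaudi_config = GaudiConfig.from_pretrained("Habana/bert-large-uncased-whole-word-masking")

# Pass it to the trainer through the `gaudi_config` argument
trainer = GaudiTrainer(
    model=model,                  # assumed: model instantiated elsewhere
    gaudi_config=gaudi_config,
    args=training_args,           # assumed: a GaudiTrainingArguments instance
    train_dataset=train_dataset,  # assumed: dataset prepared elsewhere
)
```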
- - -## GaudiConfig - -[[autodoc]] transformers.gaudi_configuration.GaudiConfig - - all diff --git a/docs/source/package_reference/stable_diffusion_pipeline.mdx b/docs/source/package_reference/stable_diffusion_pipeline.mdx deleted file mode 100644 index e493bdb8d3..0000000000 --- a/docs/source/package_reference/stable_diffusion_pipeline.mdx +++ /dev/null @@ -1,77 +0,0 @@ - - -# GaudiStableDiffusionPipeline - -The `GaudiStableDiffusionPipeline` class enables to perform text-to-image generation on HPUs. -It inherits from the `GaudiDiffusionPipeline` class that is the parent to any kind of diffuser pipeline. - -To get the most out of it, it should be associated with a scheduler that is optimized for HPUs like `GaudiDDIMScheduler`. - - -## GaudiStableDiffusionPipeline - -[[autodoc]] diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipeline - - __call__ - - -## GaudiDiffusionPipeline - -[[autodoc]] diffusers.pipelines.pipeline_utils.GaudiDiffusionPipeline - - all - - -## GaudiDDIMScheduler - -[[autodoc]] diffusers.schedulers.scheduling_ddim.GaudiDDIMScheduler - - all - - -# GaudiStableDiffusionXLPipeline - -The `GaudiStableDiffusionXLPipeline` class enables to perform text-to-image generation on HPUs using SDXL models. -It inherits from the `GaudiDiffusionPipeline` class that is the parent to any kind of diffuser pipeline. - -To get the most out of it, it should be associated with a scheduler that is optimized for HPUs like `GaudiDDIMScheduler`. -Recommended schedulers are `GaudiEulerDiscreteScheduler` for SDXL base and `GaudiEulerAncestralDiscreteScheduler` for SDXL turbo. - - -## GaudiStableDiffusionXLPipeline - -[[autodoc]] diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.GaudiStableDiffusionXLPipeline - - __call__ - - -## GaudiEulerDiscreteScheduler - -[[autodoc]] diffusers.schedulers.scheduling_euler_discrete.GaudiEulerDiscreteScheduler - - all - - -## GaudiEulerAncestralDiscreteScheduler - -[[autodoc]] diffusers.schedulers.scheduling_euler_ancestral_discrete.GaudiEulerAncestralDiscreteScheduler - - all - - -# GaudiStableDiffusionUpscalePipeline - -The `GaudiStableDiffusionUpscalePipeline` is used to enhance the resolution of input images by a factor of 4 on HPUs. -It inherits from the `GaudiDiffusionPipeline` class that is the parent to any kind of diffuser pipeline. - - -[[autodoc]] diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.GaudiStableDiffusionUpscalePipeline - - __call__ diff --git a/docs/source/package_reference/trainer.mdx b/docs/source/package_reference/trainer.mdx deleted file mode 100644 index e584294fc5..0000000000 --- a/docs/source/package_reference/trainer.mdx +++ /dev/null @@ -1,69 +0,0 @@ - - -# GaudiTrainer - -The [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainer) class provides an extended API for the feature-complete [Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer). It is used in all the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples). - -Before instantiating your [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainer), create a [`GaudiTrainingArguments`] object to access all the points of customization during training. 
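As an illustration, a minimal `GaudiTrainingArguments` object might look like the sketch below; the output directory and Gaudi configuration name are placeholder values to adapt to your own setup:

```python
from optimum.habana import GaudiTrainingArguments

training_args = GaudiTrainingArguments(
    output_dir="./results",   # placeholder output path
    use_habana=True,          # run on HPU
    use_lazy_mode=True,       # lazy execution mode (recommended)
    gaudi_config_name="Habana/bert-large-uncased-whole-word-masking",  # Gaudi configuration from the Hub
)
```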
- - - -The [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainer) class is optimized for 🤗 Transformers models running on Habana Gaudi. - - - -Here is an example of how to customize [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainer) to use a weighted loss (useful when you have an unbalanced training set): - -```python -from torch import nn -from optimum.habana import GaudiTrainer - - -class CustomGaudiTrainer(GaudiTrainer): - def compute_loss(self, model, inputs, return_outputs=False): - labels = inputs.get("labels") - # forward pass - outputs = model(**inputs) - logits = outputs.get("logits") - # compute custom loss (suppose one has 3 labels with different weights) - loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0])) - loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) - return (loss, outputs) if return_outputs else loss -``` - -Another way to customize the training loop behavior for the PyTorch [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainer) is to use [callbacks](https://huggingface.co/docs/transformers/main_classes/callback) that can inspect the training loop state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early stopping). - -## GaudiTrainer - -[[autodoc]] transformers.trainer.GaudiTrainer - - all - -## GaudiSeq2SeqTrainer - -[[autodoc]] transformers.trainer_seq2seq.GaudiSeq2SeqTrainer - - evaluate - - predict - -## GaudiTrainingArguments - -[[autodoc]] transformers.training_args.GaudiTrainingArguments - - all - -## GaudiSeq2SeqTrainingArguments - -[[autodoc]] transformers.training_args_seq2seq.GaudiSeq2SeqTrainingArguments - - all diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx deleted file mode 100644 index 0690cd99c6..0000000000 --- a/docs/source/quickstart.mdx +++ /dev/null @@ -1,105 +0,0 @@ - - - -# Quickstart - -🤗 Optimum Habana was designed with one goal in mind: **making training and evaluation straightforward for any 🤗 Transformers user while leveraging the complete power of Gaudi processors**. -There are two main classes one needs to know: -- [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer): the trainer class that takes care of compiling (lazy or eager mode) and distributing the model to run on HPUs, and of performing training and evaluation. -- [`GaudiConfig`](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config): the class that enables to configure Habana Mixed Precision and to decide whether optimized operators and optimizers should be used or not. - -The [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer) is very similar to the [🤗 Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer), and adapting a script using the Trainer to make it work with Gaudi will mostly consist in simply swapping the `Trainer` class for the `GaudiTrainer` one. -That is how most of the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples) were adapted from their [original counterparts](https://github.com/huggingface/transformers/tree/main/examples/pytorch). 
- -```diff -- from transformers import Trainer, TrainingArguments -+ from optimum.habana import GaudiTrainer, GaudiTrainingArguments - -# Define the training arguments -- training_args = TrainingArguments( -+ training_args = GaudiTrainingArguments( -+ use_habana=True, -+ use_lazy_mode=True, -+ gaudi_config_name=gaudi_config_name, - ... -) - -# Initialize our Trainer -- trainer = Trainer( -+ trainer = GaudiTrainer( - model=model, - args=training_args, - train_dataset=train_dataset - ... # other arguments -) -``` - -where `gaudi_config_name` is the name of a model from the [Hub](https://huggingface.co/Habana) (Gaudi configurations are stored in model repositories) or a path to a local Gaudi configuration file (you can see [here](./package_reference/gaudi_config) how to write your own). - - -## Stable Diffusion - -🤗 Optimum Habana also features HPU-optimized support for the 🤗 Diffusers library. -Thus, you can easily deploy Stable Diffusion on Gaudi for performing text-to-image generation. - -Here is how to use it and the differences with the 🤗 Diffusers library: -```diff -- from diffusers import DDIMScheduler, StableDiffusionPipeline -+ from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline - - -model_name = "runwayml/stable-diffusion-v1-5" - -- scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler") -+ scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - -- pipeline = StableDiffusionPipeline.from_pretrained( -+ pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, -+ use_habana=True, -+ use_hpu_graphs=True, -+ gaudi_config="Habana/stable-diffusion", -) - -outputs = pipeline( - ["An image of a squirrel in Picasso style"], - num_images_per_prompt=16, -+ batch_size=4, -) -``` - - -## Ready-to-Use Examples - -Here are examples for various modalities and tasks that can be used out of the box: -- Text - - [text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification), - - [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering), - - [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling), - - [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation), - - [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization), - - [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation), - - [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding) -- Images - - [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification) -- Audio - - [audio classification](https://github.com/huggingface/optimum-habana/tree/main/examples/audio-classification), - - [speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition) -- Text and images - - [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion), - - [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text). 
diff --git a/docs/source/tutorials/distributed.mdx b/docs/source/tutorials/distributed.mdx deleted file mode 100644 index dd81e8da52..0000000000 --- a/docs/source/tutorials/distributed.mdx +++ /dev/null @@ -1,63 +0,0 @@ - - -# Distributed training with Optimum Habana - -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. - -All the [PyTorch examples](https://github.com/huggingface/optimum-habana/tree/main/examples) and the [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer) script work out of the box with distributed training. -There are two ways of launching them: - -1. Using the [gaudi_spawn.py](https://github.com/huggingface/optimum-habana/blob/main/examples/gaudi_spawn.py) script: - -```bash -python gaudi_spawn.py \ - --world_size number_of_hpu_you_have --use_mpi \ - path_to_script.py --args1 --args2 ... --argsN -``` -where `--argX` is an argument of the script to run in a distributed way. -Examples are given for question answering [here](https://github.com/huggingface/optimum-habana/blob/main/examples/question-answering/README.md#multi-card-training) and text classification [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification#multi-card-training). - -2. Using the [`DistributedRunner`](https://huggingface.co/docs/optimum/habana/package_reference/distributed_runner) directly in code: - -```python -from optimum.habana.distributed import DistributedRunner -from optimum.utils import logging - -world_size=8 # Number of HPUs to use (1 or 8) - -# define distributed runner -distributed_runner = DistributedRunner( - command_list=["scripts/train.py --args1 --args2 ... --argsN"], - world_size=world_size, - use_mpi=True, -) - -# start job -ret_code = distributed_runner.run() -``` - - - -You can set the training argument `--distribution_strategy fast_ddp` for simpler and usually faster distributed training management. More information [here](../usage_guides/accelerate_training#fast-ddp). - - - -To go further, we invite you to read our guides about: -- [Accelerating training](../usage_guides/accelerate_training) -- [Pretraining](../usage_guides/pretraining) -- [DeepSpeed](../usage_guides/deepspeed) to train bigger models -- [Multi-node training](../usage_guides/multi_node_training) to speed up even more your distributed runs diff --git a/docs/source/tutorials/inference.mdx b/docs/source/tutorials/inference.mdx deleted file mode 100644 index a7cb9b5f25..0000000000 --- a/docs/source/tutorials/inference.mdx +++ /dev/null @@ -1,72 +0,0 @@ - - -# Run Inference - -This section shows how to run inference-only workloads on Gaudi. -For more advanced information about how to speed up inference, check out [this guide](../usage_guides/accelerate_inference). - - -## With GaudiTrainer - -You can find below a template to perform inference with a `GaudiTrainer` instance where we want to compute the accuracy over the given dataset: - -```python -import evaluate - -metric = evaluate.load("accuracy") - -# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a -# predictions and label_ids field) and has to return a dictionary string to float. 
-def my_compute_metrics(p): - return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids) - -# Trainer initialization -trainer = GaudiTrainer( - model=my_model, - gaudi_config=my_gaudi_config, - args=my_args, - train_dataset=None, - eval_dataset=eval_dataset, - compute_metrics=my_compute_metrics, - tokenizer=my_tokenizer, - data_collator=my_data_collator, - ) - -# Run inference -metrics = trainer.evaluate() -``` - -The variable `my_args` should contain some inference-specific arguments, you can take a look [here](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.set_evaluate) to see the arguments that can be interesting to set for inference. - - -## In our Examples - -All [our examples](https://github.com/huggingface/optimum-habana/tree/main/examples) contain instructions for running inference with a given model on a given dataset. -The reasoning is the same for every example: run the example script with `--do_eval` and `--per_device_eval_batch_size` and without `--do_train`. -A simple template is the following: -```bash -python path_to_the_example_script \ - --model_name_or_path my_model_name \ - --gaudi_config_name my_gaudi_config_name \ - --dataset_name my_dataset_name \ - --do_eval \ - --per_device_eval_batch_size my_batch_size \ - --output_dir path_to_my_output_dir \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference -``` diff --git a/docs/source/tutorials/overview.mdx b/docs/source/tutorials/overview.mdx deleted file mode 100644 index 7e8b306d1d..0000000000 --- a/docs/source/tutorials/overview.mdx +++ /dev/null @@ -1,24 +0,0 @@ - - -# Overview - -Welcome to the 🤗 Optimum Habana tutorials! -They will help you to get started quickly on the following topics: -- How to [train a model on a single device](./single_hpu) -- How to [train a model on several devices](./distributed) -- How to [run inference with your model](./inference) -- How to [generate images from text with Stable Diffusion](./stable_diffusion) diff --git a/docs/source/tutorials/single_hpu.mdx b/docs/source/tutorials/single_hpu.mdx deleted file mode 100644 index 575a334a6a..0000000000 --- a/docs/source/tutorials/single_hpu.mdx +++ /dev/null @@ -1,26 +0,0 @@ - - -# Single-HPU Training - -Training on a single device is as simple as in Transformers: -- You need to replace the Transformers' [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) class with the [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer) class, -- You need to replace the Transformers' [`TrainingArguments`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) class with the [`GaudiTrainingArguments`] class and add the following arguments: - - `use_habana` to execute your script on an HPU, - - `use_lazy_mode` to use lazy mode (recommended) or not (i.e. eager mode), - - `gaudi_config_name` to give the name of (Hub) or the path to (local) your Gaudi configuration file. - -To go further, we invite you to read our guides about [accelerating training](../usage_guides/accelerate_training) and [pretraining](../usage_guides/pretraining). diff --git a/docs/source/tutorials/stable_diffusion.mdx b/docs/source/tutorials/stable_diffusion.mdx deleted file mode 100644 index c662005a5f..0000000000 --- a/docs/source/tutorials/stable_diffusion.mdx +++ /dev/null @@ -1,183 +0,0 @@ - - -# Stable Diffusion - -Stable Diffusion is a text-to-image latent diffusion model. 
-Check out this [blog post](https://huggingface.co/blog/stable_diffusion) for more information. - - -## How to generate images? - -To generate images with Stable Diffusion on Gaudi, you need to instantiate two instances: -- A pipeline with [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline). This pipeline supports *text-to-image generation*. -- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Gaudi. - -When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs. -Furthermore, to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`. -Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the Hugging Face Hub. - -```python -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline - -model_name = "runwayml/stable-diffusion-v1-5" - -scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - -pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", -) -``` - -You can then call the pipeline to generate images from one or several prompts: -```python -outputs = pipeline( - prompt=["High quality photo of an astronaut riding a horse in space", "Face of a yellow cat, high resolution, sitting on a park bench"], - num_images_per_prompt=10, - batch_size=4, - output_type="pil", -) -``` - -Outputs can be PIL images or Numpy arrays. See [here](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.StableDiffusionPipeline.__call__) all the parameters you can set to tailor generations to your taste. - - - -Check out the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository. - - - - -## Stable Diffusion 2 - -[Stable Diffusion 2](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion_2) can be used with the exact same classes. -Here is an example: - -```python -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline - - -model_name = "stabilityai/stable-diffusion-2-1" - -scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - -pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion-2", -) - -outputs = pipeline( - ["An image of a squirrel in Picasso style"], - num_images_per_prompt=10, - batch_size=2, - height=768, - width=768, -) -``` - - - -There are two different checkpoints for Stable Diffusion 2: - -- use [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) for generating 768x768 images -- use [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) for generating 512x512 images - - - - -## Super-resolution - -The Stable Diffusion upscaler diffusion model was created by the researchers and engineers from CompVis, Stability AI, and LAION. It is used to enhance the resolution of input images by a factor of 4. 
- -See [here](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/upscale) for more information. - -### How to upscale low resolution images? - -To generate RGB and depth images with Stable Diffusion Upscale on Gaudi, you need to instantiate two instances: -- A pipeline with [`GaudiStableDiffusionUpscalePipeline`](../package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiStableDiffusionUpscalePipeline). -- A scheduler with [`GaudiDDIMScheduler`](../package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Gaudi. - -When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs. -Furthermore, to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`. -Finally, you will need to specify a [Gaudi configuration](../package_reference/gaudi_config) which can be downloaded from the Hugging Face Hub. - -```python -import requests -from io import BytesIO -from optimum.habana.diffusers import ( - GaudiDDIMScheduler, - GaudiStableDiffusionUpscalePipeline, -) -from optimum.habana.utils import set_seed -from PIL import Image - -set_seed(42) - -model_name_upscale = "stabilityai/stable-diffusion-x4-upscaler" -scheduler = GaudiDDIMScheduler.from_pretrained(model_name_upscale, subfolder="scheduler") -url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" -response = requests.get(url) -low_res_img = Image.open(BytesIO(response.content)).convert("RGB") -low_res_img = low_res_img.resize((128, 128)) -low_res_img.save("low_res_cat.png") -prompt = "a white cat" - -pipeline = GaudiStableDiffusionUpscalePipeline.from_pretrained( - model_name_upscale, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", -) -upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0] -upscaled_image.save("upsampled_cat.png") - -``` - - -## Tips - -To accelerate your Stable Diffusion pipeline, you can run it in full *bfloat16* precision. -This will also save memory. -You just need to pass `torch_dtype=torch.bfloat16` to `from_pretrained` when instantiating your pipeline. -Here is how to do it: - -```py -import torch - -pipeline = GaudiStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", - torch_dtype=torch.bfloat16 -) -``` - - -## Textual Inversion Fine-Tuning - -[Textual Inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like Stable Diffusion on your own images using just 3-5 examples. - -You can find [here](https://github.com/huggingface/optimum-habana/blob/main/examples/stable-diffusion/textual_inversion.py) an example script that implements this training method. diff --git a/docs/source/tutorials/stable_diffusion_ldm3d.mdx b/docs/source/tutorials/stable_diffusion_ldm3d.mdx deleted file mode 100644 index d7c975ceb6..0000000000 --- a/docs/source/tutorials/stable_diffusion_ldm3d.mdx +++ /dev/null @@ -1,67 +0,0 @@ - - -# Text-to-(RGB, depth) - -LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. 
LDM3D generates an image and a depth map from a given text prompt unlike the existing text-to-image diffusion models such as [Stable Diffusion](./stable_diffusion) which only generates an image. With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps. - -The abstract from the paper is: - -*This research paper proposes a Latent Diffusion Model for 3D (LDM3D) that generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. The LDM3D model is fine-tuned on a dataset of tuples containing an RGB image, depth map and caption, and validated through extensive experiments. We also develop an application called DepthFusion, which uses the generated RGB images and depth maps to create immersive and interactive 360-degree-view experiences using TouchDesigner. This technology has the potential to transform a wide range of industries, from entertainment and gaming to architecture and design. Overall, this paper presents a significant contribution to the field of generative AI and computer vision, and showcases the potential of LDM3D and DepthFusion to revolutionize content creation and digital experiences. A short video summarizing the approach can be found at [this url](https://t.ly/tdi2).* - - -## How to generate RGB and depth images? - -To generate RGB and depth images with Stable Diffusion LDM3D on Gaudi, you need to instantiate two instances: -- A pipeline with [`GaudiStableDiffusionLDM3DPipeline`]. This pipeline supports *text-to-(rgb, depth) generation*. -- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Gaudi. - -When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs. -Furthermore, to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`. -Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the Hugging Face Hub. - -```python -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionLDM3DPipeline -from optimum.habana.utils import set_seed - -model_name = "Intel/ldm3d-4c" - -scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - -set_seed(42) - -pipeline = GaudiStableDiffusionLDM3DPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", -) -outputs = pipeline( - prompt=["High quality photo of an astronaut riding a horse in space"], - num_images_per_prompt=1, - batch_size=1, - output_type="pil", - num_inference_steps=40, - guidance_scale=5.0, - negative_prompt=None -) - - -rgb_image, depth_image = outputs.rgb, outputs.depth -rgb_image[0].save("astronaut_ldm3d_rgb.png") -depth_image[0].save("astronaut_ldm3d_depth.png") -``` diff --git a/docs/source/usage_guides/accelerate_inference.mdx b/docs/source/usage_guides/accelerate_inference.mdx deleted file mode 100644 index be113daf32..0000000000 --- a/docs/source/usage_guides/accelerate_inference.mdx +++ /dev/null @@ -1,102 +0,0 @@ - - -# Accelerating Inference - -Gaudi offers several possibilities to make inference faster. 
- - -## Lazy Mode - -Two execution modes are proposed: -- *Lazy mode*, where operations are accumulated in a graph whose execution is triggered in a lazy manner. This allows the graph compiler to optimize the device execution for these operations. -- *Eager mode*, where one operation at a time is executed. - -In lazy mode, the graph compiler generates optimized binary code that implements the given model topology on Gaudi. It performs operator fusion, data layout management, parallelization, pipelining and memory management, as well as graph-level optimizations. - -To execute inference in lazy mode, you must provide the following arguments: -```python -args = GaudiTrainingArguments( - # same arguments as in Transformers, - use_habana=True, - use_lazy_mode=True, -) -``` - - - -In lazy mode, the last batch may trigger an extra compilation because it could be smaller than previous batches. -To avoid this, you can discard the last batch with `dataloader_drop_last=True`. - - - - -## HPU Graphs - -Gaudi provides a way to run fast inference with HPU Graphs. -It consists in capturing a series of operations (i.e. graphs) in an HPU stream and then replaying them in an optimized way (more information [here](https://docs.habana.ai/en/latest/PyTorch/Inference_on_Gaudi/Inference_using_HPU_Graphs/Inference_using_HPU_Graphs.html)). -Thus, you can apply this to the `forward` method of your model to run it efficiently at inference. - -HPU Graphs are integrated into the [`GaudiTrainer`](../package_reference/trainer) and the [`GaudiStableDiffusionPipeline`](../package_reference/stable_diffusion_pipeline) so that one can use them very easily: -- `GaudiTrainer` needs the training argument `use_hpu_graphs_for_inference` to be set to `True` as follows: -```python -from optimum.habana import GaudiTrainer, GaudiTrainingArguments - -# define the training arguments -training_args = GaudiTrainingArguments( - use_habana=True, - use_lazy_mode=True, - use_hpu_graphs_for_inference=True, - gaudi_config_name=gaudi_config_name, - ... -) - -# Initialize our Trainer -trainer = GaudiTrainer( - model=model, - args=training_args, - train_dataset=train_dataset - ... # other arguments -) -``` -- `GaudiStableDiffusionPipeline` needs its argument `use_hpu_graphs` to be set to `True` such as: -```python -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline - -model_name = "runwayml/stable-diffusion-v1-5" - -scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - -pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", -) - -outputs = generator( - ["An image of a squirrel in Picasso style"], - num_images_per_prompt=16, - batch_size=4, -) -``` - - - -With HPU Graphs and in lazy mode, the *first couple of iterations* may be slower due to graph compilations. - - diff --git a/docs/source/usage_guides/accelerate_training.mdx b/docs/source/usage_guides/accelerate_training.mdx deleted file mode 100644 index dd2f1530d4..0000000000 --- a/docs/source/usage_guides/accelerate_training.mdx +++ /dev/null @@ -1,184 +0,0 @@ - - -# Accelerating Training - -Gaudi offers several possibilities to make training faster. -They are all compatible with each other and can be coupled with [distributed training](https://huggingface.co/docs/optimum/habana/usage_guides/distributed). 
- - -## Lazy Mode - -Two execution modes are proposed: -- *Lazy mode*, where operations are accumulated in a graph whose execution is triggered in a lazy manner. This allows the graph compiler to optimize the device execution for these operations. -- *Eager mode*, where one operation at a time is executed. - -In lazy mode, the graph compiler generates optimized binary code that implements the given model topology on Gaudi. It performs operator fusion, data layout management, parallelization, pipelining and memory management, as well as graph-level optimizations. - -To execute your training in lazy mode, you must provide the following training arguments: -```python -args = GaudiTrainingArguments( - # same arguments as in Transformers, - use_habana=True, - use_lazy_mode=True, - gaudi_config_name=path_to_my_gaudi_config -) -``` - - - -In lazy mode, the last batch is filled with extra samples by default so that it has the same dimensions as previous batches. -This enables to avoid extra graph compilations during training. -You can also discard the last batch with `dataloader_drop_last=True`. - - - - - -In lazy mode, the first two or three training iterations may be slower due to graph compilations. -To not take them into account in the computation of the throughput at the end of the training, you can add the following training argument: `throughput_warmup_steps=3`. - - - - -## Mixed-Precision Training - -Mixed-precision training enables to compute some operations using lighter data types to accelerate training. -Optimum Habana enables mixed precision training in a similar fashion as 🤗 Transformers: -- argument `--bf16` enables usage of PyTorch autocast -- argument `--half_precision_backend [hpu_amp, cpu_amp]` is used to specify a device on which mixed precision operations should be performed - - - - -Please refer to the [advanced autocast usage on Gaudi](https://docs.habana.ai/en/latest/PyTorch/PyTorch_Mixed_Precision/Autocast.html) for more informations regarding: -- default autocast operations -- default autocast operations override - - - - -## HPU Graphs - -The flexibility of PyTorch comes at a price - usually the same pythonic logic is processed every training step over and over. -This may lead to a situation where it takes longer for CPU to schedule the work on Habana accelerator than it is effectively computed by it. -To cope with such host-bound workloads, you may want to try enabling the _HPU Graphs_ feature, which records the computational graph once, then only triggers it for execution much faster multiple times. - -To do so, specify [`--use_hpu_graphs_for_training True`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainingArguments.use_hpu_graphs_for_training). -This option will wrap the model in [`habana_frameworks.torch.hpu.ModuleCacher`](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/HPU_Graphs_Training.html#training-loop-with-modulecacher), which automatically records _HPU Graphs_ on the model's usage. - -For multi-worker distributed training, you also need to specify [`--distribution_strategy fast_ddp`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainingArguments.distribution_strategy). -This option replaces the usage of `torch.nn.parallel.DistributedDataParallel` with much simpler and usually faster `optimum.habana.distributed.all_reduce_gradients`. - - - -Use with caution: currently using HPU Graphs for training may not support all the possible cases. 
-However, the potential performance gain could be dramatic! - - - - -## Fast DDP - -For distributed training on several devices, you can also specify [`--distribution_strategy fast_ddp`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainingArguments.distribution_strategy). -This option replaces the usage of `torch.nn.parallel.DistributedDataParallel` with much simpler and usually faster `optimum.habana.distributed.all_reduce_gradients`. - - -## Pipelining Forward and Backward Passes - -There are two stages when running models on Habana HPU: python code interpretation on CPU and HPU recipe computation. -The HPU computation stage can be triggered manually or when a copy to the CPU is requested, and generally HPU computation is triggered after `loss.backward()` to make the CPU code interpretation and HPU recipe computation overlap as shown in the following illustration: - -``` -CPU:...forward + backward ...optimizer ...forward + backward ...optimizer ... -HPU:........................forward + backward...optimizer......forward + backward...optimizer -``` - -However, when CPU code interpretation takes longer than HPU computation, it becomes the bottleneck and HPU computation can not be triggered until CPU code interpretation is done. -So one potential optimization for such cases is to trigger the HPU *forward* computation right after the CPU *forward* interpretation and before the CPU *backward* interpretation. -You can see an example below where the CPU *backward* interpretation overlaps with the HPU *forward* computation: - -``` -CPU:...forward ...backward ...optimizer ...forward ...backward ...optimizer ... -HPU:.............forward.......backward......optimizer......forward.....backward.......optimizer -``` - -To enable this optimization, you can set the following training argument [`--pipelining_fwd_bwd True`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainingArguments.pipelining_fwd_bwd). - -**We recommend using it on Gaudi2** as the host will often be the bottleneck. -You should be able to see a speedup on first-generation Gaudi too, but it will be less significant than on Gaudi2 because your run is more likely to be HPU-bound. - -Furthermore, *when training models that require large device memory*, we suggest disabling this optimization because *it will increase the HPU memory usage*. - - -## Use More Workers for Data Loading - -If the workload of the data loader is heavy, you can increase the number of workers to make your run faster. -You can enable this with the training argument [`--dataloader_num_workers N`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.dataloader_num_workers) with `N` being the number of workers to use. - -**We recommend using it with datasets containing images.** -Besides, using `--dataloader_num_workers 1` should help in most cases as it enables data loading in a thread different from the main one. - - -## Non-Blocking Data Copy - -This optimization is well-suited for models with a high cost of copying data from the host to the device (e.g. vision models like ViT or Swin). -You can enable it with the training argument [`--non_blocking_data_copy True`](https://huggingface.co/docs/optimum/habana/package_reference/trainer#optimum.habana.GaudiTrainingArguments.non_blocking_data_copy). - -**We recommend using it on Gaudi2** where the host can continue to execute other tasks (e.g. 
graph building) to get a better pipelining between the host and the device. -On first-generation Gaudi, the device executing time is longer so one should not expect to get any speedup. - - -## Custom Operators - -Habana provides a few custom operators that achieve better performance than their PyTorch counterparts on Gaudi. -You can also define your own custom operator for Gaudi as described [here](https://docs.habana.ai/en/latest/PyTorch/PyTorch_CustomOp_API/page_index.html). - - -### Fused ADAM - -Habana provides a [custom fused ADAM implementation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#custom-optimizers). -It can be used by specifying `"use_fused_adam": true` in the Gaudi configuration file. - - - -The default value of *epsilon* is `1e-6` for the Habana fused ADAM optimizer, while it is `1e-8` for `torch.optim.AdamW`. - - - - -### Fused Gradient Norm Clipping - -Habana provides a [custom gradient norm clipping implementation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#other-custom-ops). -It can be used by specifying `"use_fused_clip_norm": true` in the Gaudi configuration file. - - -## Tracking Memory Usage - -Live memory statistics are displayed every `logging_steps` (default is 500) steps: -- `memory_allocated (GB)` refers to the *current* memory consumption in GB, -- `max_memory_allocated (GB)` refers to the *maximum* memory consumption reached during the run in GB, -- `total_memory_available (GB)` refers to the *total* memory available on the device in GB. - -These metrics can help you to adjust the batch size of your runs. - - - -In distributed mode, memory stats are communicated only by the main process. - - - -You can take a look at [Intel® Gaudi® AI Accelerator's official documentation](https://docs.habana.ai/en/latest/PyTorch/PyTorch_User_Guide/Python_Packages.html#memory-stats-apis) for more information about the memory stats API. \ No newline at end of file diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx deleted file mode 100644 index dfd68b2783..0000000000 --- a/docs/source/usage_guides/deepspeed.mdx +++ /dev/null @@ -1,140 +0,0 @@ - - -# DeepSpeed for HPUs - -[DeepSpeed](https://www.deepspeed.ai/) enables you to fit and train larger models on HPUs thanks to various optimizations described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). -In particular, you can use the two following ZeRO configurations that have been validated to be fully functioning with Gaudi: -- **ZeRO-1**: partitions the optimizer states across processes. -- **ZeRO-2**: partitions the optimizer states + gradients across processes. - -These configurations are fully compatible with Habana Mixed Precision and can thus be used to train your model in *bf16* precision. - -You can find more information about DeepSpeed Gaudi integration [here](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide). 
- - -## Setup - -To use DeepSpeed on Gaudi, you need to install Optimum Habana and [Habana's DeepSpeed fork](https://github.com/HabanaAI/DeepSpeed) with: -```bash -pip install optimum[habana] -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -``` - - -## Using DeepSpeed with Optimum Habana - -The [`GaudiTrainer`](https://huggingface.co/docs/optimum/habana/package_reference/trainer) allows using DeepSpeed as easily as the [Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer). -This can be done in 3 steps: -1. A DeepSpeed configuration has to be defined. -2. The `deepspeed` training argument enables to specify the path to the DeepSpeed configuration. -3. The `deepspeed` launcher must be used to run your script. - -These steps are detailed below. -A comprehensive guide about how to use DeepSpeed with the Transformers Trainer is also available [here](https://huggingface.co/docs/transformers/main_classes/deepspeed). - - -### DeepSpeed configuration - -The DeepSpeed configuration to use is passed through a JSON file and enables you to choose the optimizations to apply. -Here is an example for applying ZeRO-2 optimizations and *bf16* precision: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - - - -The special value `"auto"` enables to automatically get the correct or most efficient value. -You can also specify the values yourself but, if you do so, you should be careful not to have conflicting values with your training arguments. -It is strongly advised to read [this section](https://huggingface.co/docs/transformers/main_classes/deepspeed#shared-configuration) in the Transformers documentation to completely understand how this works. - - - -Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.16.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Habana. - -The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well. -A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/). - - -### The `deepspeed` training argument - -To use DeepSpeed, you must specify `deespeed=path_to_my_deepspeed_configuration` in your `GaudiTrainingArguments` instance: -```python -training_args = GaudiTrainingArguments( - # my usual training arguments... - use_habana=True, - use_lazy_mode=True, - gaudi_config_name=path_to_my_gaudi_config, - deepspeed=path_to_my_deepspeed_config, -) -``` - -This argument both indicates that DeepSpeed should be used and points to your DeepSpeed configuration. - - -### Launching your script - -Finally, there are two possible ways to launch your script: - -1. Using the [gaudi_spawn.py](https://github.com/huggingface/optimum-habana/blob/main/examples/gaudi_spawn.py) script: - -```bash -python gaudi_spawn.py \ - --world_size number_of_hpu_you_have --use_deepspeed \ - path_to_script.py --args1 --args2 ... --argsN \ - --deepspeed path_to_deepspeed_config -``` -where `--argX` is an argument of the script to run with DeepSpeed. - -2. 
Using the [`DistributedRunner`](https://huggingface.co/docs/optimum/habana/package_reference/distributed_runner) directly in code: - -```python -from optimum.habana.distributed import DistributedRunner -from optimum.utils import logging - -world_size=8 # Number of HPUs to use (1 or 8) - -# define distributed runner -distributed_runner = DistributedRunner( - command_list=["scripts/train.py --args1 --args2 ... --argsN --deepspeed path_to_deepspeed_config"], - world_size=world_size, - use_deepspeed=True, -) - -# start job -ret_code = distributed_runner.run() -``` - - - -You should set `"use_fused_adam": false` in your Gaudi configuration because it is not compatible with DeepSpeed yet. - - diff --git a/docs/source/usage_guides/multi_node_training.mdx b/docs/source/usage_guides/multi_node_training.mdx deleted file mode 100644 index 73239b8f7a..0000000000 --- a/docs/source/usage_guides/multi_node_training.mdx +++ /dev/null @@ -1,177 +0,0 @@ - - -# Multi-node Training - -Using several Gaudi servers to perform multi-node training can be done easily. This guide shows how to: -- set up several Gaudi instances -- set up your computing environment -- launch a multi-node run - - -## Setting up several Gaudi instances - -Two types of configurations are possible: -- scale-out using Gaudi NICs or Host NICs (on-premises) -- scale-out using AWS DL1 instances - - -### On premises - -To set up your servers on premises, check out the [installation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html) and [distributed training](https://docs.habana.ai/en/latest/PyTorch/PyTorch_Scaling_Guide/index.html) pages of Habana Gaudi's documentation. - - -### AWS DL1 instances - -Proceed with the following steps to correctly set up your DL1 instances. - - -#### 1. Set up an EFA-enabled security group - -To allow all instances to communicate with each other, you need to set up a *security group* as described by AWS in step 1 of [this link](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security). -Once this is done, it should look as follows: -

*Figure: security group rules for multi-node training on AWS DL1 instances.*
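If you prefer to script this step, the same self-referencing all-traffic rules can be created with `boto3`. This is only a sketch: the VPC ID and group name below are placeholders, and it assumes your AWS credentials and region are already configured.

```python
import boto3

ec2 = boto3.client("ec2")

# Placeholder: replace with the VPC in which your DL1 instances will run
vpc_id = "vpc-0123456789abcdef0"

sg = ec2.create_security_group(
    GroupName="efa-multi-node-training",
    Description="EFA-enabled security group for multi-node training",
    VpcId=vpc_id,
)
sg_id = sg["GroupId"]

# EFA requires the security group to allow all inbound and outbound traffic
# to and from the security group itself
self_referencing_rule = [{"IpProtocol": "-1", "UserIdGroupPairs": [{"GroupId": sg_id}]}]
ec2.authorize_security_group_ingress(GroupId=sg_id, IpPermissions=self_referencing_rule)
ec2.authorize_security_group_egress(GroupId=sg_id, IpPermissions=self_referencing_rule)
print(f"Created security group {sg_id}")
```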

    - - -#### 2. Launching instances - -When you launch instances from the AWS EC2 console, you can choose the number of nodes to set up. - -We recommend using the [Habana Deep Learning Base AMI](https://docs.habana.ai/en/latest/Installation_Guide/Habana_Deep_Learning_AMI.html) for your AWS DL1 instances. -It is an EFA-enabled AMI so you do not need to install the EFA software (which may be necessary if you use a different AMI, installation instructions [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html)). - -Then, in the *Network settings*, select the *security group* you created in the previous step. You also have to select a specific *subnet* to unlock the *Advanced network configuration* in which you can enable the *Elastic Fabric Adapter*. - -The last parameter to set is the *Placement group* in the *Advanced details*. You can create one if you do not have any. The *placement strategy* should be set to *cluster*. - -Here is how it should look: -

*Figure: parameters for launching EFA-enabled AWS instances. The important parameters to set are circled in red; for the sake of clarity, not all parameters are represented.*
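If you script your setup, the *cluster* placement group can be created beforehand with `boto3` (a minimal sketch; the group name is a placeholder) and then selected when launching the instances:

```python
import boto3

ec2 = boto3.client("ec2")

# The "cluster" strategy packs instances close together for low-latency, high-throughput networking
ec2.create_placement_group(GroupName="gaudi-dl1-cluster", Strategy="cluster")
```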

    - -More information [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-instances). - - -## Launching a Multi-node Run - -Once your Gaudi instances are ready, you need to: - -1. Enable password-less SSH on your instances so that they can communicate with each other. [This explains how to do it](https://docs.habana.ai/en/latest/AWS_User_Guides/AWS_Distributed_Training_Multiple_DL1/AWS_Distributed_Training_Multiple_DL1.html#running-distributed-training-over-multiple-dl1-instances). -2. On AWS, to train through EFA, `hccl_ofi_wrapper` should be installed. [Here is how to do it](https://docs.habana.ai/en/latest/AWS_User_Guides/AWS_Distributed_Training_Multiple_DL1/AWS_Distributed_Training_Multiple_DL1.html#build-and-store-custom-docker-image-for-training). -3. On AWS, you need to set the following environment variables (the easiest way is to write a `.deepspeed_env` file as described [here](https://huggingface.co/docs/optimum/habana/usage_guides/multi_node_training#environment-variables)): - - `HCCL_OVER_OFI=1` - - `LD_LIBRARY_PATH=path_to_hccl_ofi_wrapper:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib` where `path_to_hccl_ofi_wrapper` is the path to the `hccl_ofi_wrapper` folder which you installed in the previous step. - - (optional) `HCCL_SOCKET_IFNAME=my_network_interface`. If not set, the first network interface with a name that does not start with `lo` or `docker` will be used. More information [here](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/Using_HCCL.html?highlight=HCCL_SOCKET_IFNAME#hccl-socket-ifname). - -To make this easier, we provide a Dockerfile [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training). -You will just have to copy the public key of the leader node in the `~/.ssh/authorized_keys` file of all other nodes to enable password-less SSH. - -Then, you need to write a [hostfile](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) with the addresses and the numbers of devices of your nodes as follows: -``` -ip_1 slots=8 -ip_2 slots=8 -... -ip_n slots=8 -``` - -Finally, there are two possible ways to run your training script on several nodes: - -1. With the [`gaudi_spawn.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/gaudi_spawn.py) script, you can run the following command: -```bash -python gaudi_spawn.py \ - --hostfile path_to_my_hostfile --use_deepspeed \ - path_to_my_script.py --args1 --args2 ... --argsN \ - --deepspeed path_to_my_deepspeed_config -``` -where `--argX` is an argument of the script to run. - -2. With the [`DistributedRunner`](https://huggingface.co/docs/optimum/habana/package_reference/distributed_runner), you can add this code snippet to a script: -```python -from optimum.habana.distributed import DistributedRunner - -distributed_runner = DistributedRunner( - command_list=["path_to_my_script.py --args1 --args2 ... --argsN"], - hostfile=path_to_my_hostfile, - use_deepspeed=True, -) -``` - - -## Environment Variables - -If you need to set environment variables for all nodes, you can specify them in a [`.deepspeed_env`](https://www.deepspeed.ai/getting-started/#multi-node-environment-variables) file which should be located in the local path you are executing from or in your home directory. The format is the following: -``` -env_variable_1_name=value -env_variable_2_name=value -... 
-``` - -You can find an example for AWS instances [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/EFA/.deepspeed_env). - - -## Recommendations - -- It is strongly recommended to use gradient checkpointing for multi-node runs to get the highest speedups. You can enable it with `--gradient_checkpointing` in [these examples](https://github.com/huggingface/optimum-habana/tree/main/examples) or with `gradient_checkpointing=True` in your `GaudiTrainingArguments`. -- Larger batch sizes should lead to higher speedups. -- Multi-node inference is not recommended and can provide inconsistent results. -- On AWS DL1 instances, run your Docker containers with the `--privileged` flag so that EFA devices are visible. - - -## Example - -In this example, we fine-tune a pre-trained GPT2-XL model on the [WikiText dataset](https://huggingface.co/datasets/wikitext). -We are going to use the [causal language modeling example which is given in the Github repository](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling#gpt-2gpt-and-causal-language-modeling). - -The first step consists in training the model on several nodes with this command: -```bash -python ../gaudi_spawn.py \ - --hostfile path_to_hostfile --use_deepspeed run_clm.py \ - --model_name_or_path gpt2-xl \ - --gaudi_config_name Habana/gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --do_train \ - --output_dir /tmp/gpt2_xl_multi_node \ - --learning_rate 4e-04 \ - --per_device_train_batch_size 16 \ - --gradient_checkpointing \ - --num_train_epochs 1 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_deepspeed_config -``` - -Evaluation is not performed in the same command because we do not recommend performing multi-node inference at the moment. - -Once the model is trained, we can evaluate it with the following command. -The argument `--model_name_or_path` should be equal to the argument `--output_dir` of the previous command. -```bash -python run_clm.py \ - --model_name_or_path /tmp/gpt2_xl_multi_node \ - --gaudi_config_name Habana/gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --do_eval \ - --output_dir /tmp/gpt2_xl_multi_node \ - --per_device_eval_batch_size 8 \ - --use_habana \ - --use_lazy_mode -``` diff --git a/docs/source/usage_guides/overview.mdx b/docs/source/usage_guides/overview.mdx deleted file mode 100644 index 426f702645..0000000000 --- a/docs/source/usage_guides/overview.mdx +++ /dev/null @@ -1,25 +0,0 @@ - - -# Overview - -Welcome to the Optimum for Intel Gaudi how-to guides! -These guides tackle more advanced topics and will show you how to easily get the best from HPUs: -- [Pretraining models](./pretraining) -- [Accelerating training](./accelerate_training) -- [Accelerating inference](./accelerate_inference) -- [Using DeepSpeed](./deepspeed) to train larger models -- [Multi-node training](./multi_node_training) for faster runs diff --git a/docs/source/usage_guides/pretraining.mdx b/docs/source/usage_guides/pretraining.mdx deleted file mode 100644 index 39a94c504d..0000000000 --- a/docs/source/usage_guides/pretraining.mdx +++ /dev/null @@ -1,72 +0,0 @@ - - - -# Pretraining Transformers with Optimum Habana - -Pretraining a model from Transformers, like BERT, is as easy as fine-tuning it. -The model should be instantiated from a configuration with `.from_config` and not from a pretrained checkpoint with `.from_pretrained`. 
-Here is how it should look with GPT2 for instance: -```python -from transformers import AutoConfig, AutoModelForXXX - -config = AutoConfig.from_pretrained("gpt2") -model = AutoModelForXXX.from_config(config) -``` -with XXX the task to perform, such as `ImageClassification` for example. - -The following is a working example where BERT is pretrained for masked language modeling: -```python -from datasets import load_dataset -from optimum.habana import GaudiTrainer, GaudiTrainingArguments -from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling - -# Load the training set (this one has already been preprocessed) -training_set = load_dataset("philschmid/processed_bert_dataset", split="train") -# Load the tokenizer -tokenizer = AutoTokenizer.from_pretrained("philschmid/bert-base-uncased-2022-habana") - -# Instantiate an untrained model -config = AutoConfig.from_pretrained("bert-base-uncased") -model = AutoModelForMaskedLM.from_config(config) - -model.resize_token_embeddings(len(tokenizer)) - -# The data collator will take care of randomly masking the tokens -data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer) - -training_args = GaudiTrainingArguments( - output_dir="/tmp/bert-base-uncased-mlm", - num_train_epochs=1, - per_device_train_batch_size=8, - use_habana=True, - use_lazy_mode=True, - gaudi_config_name="Habana/bert-base-uncased", -) - -# Initialize our Trainer -trainer = GaudiTrainer( - model=model, - args=training_args, - train_dataset=training_set, - tokenizer=tokenizer, - data_collator=data_collator, -) - -trainer.train() -``` - -You can see another example of pretraining in [this blog post](https://huggingface.co/blog/pretraining-bert). diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 9b4a65f31d..0000000000 --- a/examples/README.md +++ /dev/null @@ -1,124 +0,0 @@ - - -# Examples - -This folder contains actively maintained examples of use of 🤗 Optimum Habana for various ML tasks. - -Other [examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the 🤗 Transformers library can be adapted the same way to enable deployment on Gaudi processors. This simply consists in: -- replacing the `Trainer` from 🤗 Transformers with the `GaudiTrainer` from 🤗 Optimum Habana, -- replacing the `TrainingArguments` from 🤗 Transformers with the `GaudiTrainingArguments` from 🤗 Optimum Habana. - - -## Distributed training - -All the PyTorch training scripts in this repository work out of the box with distributed training. - - -### Single node - -To launch a script on _n_ HPUs belonging to a single Gaudi server, use the following command: - -```bash -python gaudi_spawn.py \ - --world_size number_of_hpu_you_have --use_mpi \ - path_to_script.py --args1 --args2 ... --argsN -``` -where `--argX` is an argument of the script to run in a distributed way. - - -### DeepSpeed - -All the PyTorch training scripts in this repository work out of the box with DeepSpeed. To launch one of them on _n_ HPUs, use the following command: - -```bash -python gaudi_spawn.py \ - --world_size number_of_hpu_you_have --use_deepspeed \ - path_to_script.py --args1 --args2 ... --argsN \ - --deepspeed path_to_my_deepspeed_config -``` -where `--argX` is an argument of the script to run with DeepSpeed. - - -### Multi node - -All the PyTorch training scripts in this repository work out of the box on several Gaudi instances. 
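Multi-node runs can also be launched programmatically with the [`DistributedRunner`](https://huggingface.co/docs/optimum/habana/package_reference/distributed_runner); here is a minimal sketch where the script name, its arguments and the hostfile/DeepSpeed config paths are placeholders:

```python
from optimum.habana.distributed import DistributedRunner

# All paths and arguments below are placeholders
distributed_runner = DistributedRunner(
    command_list=["path_to_my_script.py --args1 --args2 ... --argsN --deepspeed path_to_my_deepspeed_config"],
    hostfile="path_to_my_hostfile",  # one "ip slots=8" line per node, see the format below
    use_deepspeed=True,
)

ret_code = distributed_runner.run()
```

The `gaudi_spawn.py` launcher does the same thing from the command line.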
To launch one of them on _n_ nodes, use the following command: - -```bash -python gaudi_spawn.py \ - --hostfile path_to_my_hostfile --use_deepspeed \ - path_to_my_script.py --args1 --args2 ... --argsN \ - --deepspeed path_to_my_deepspeed_config -``` -where `--argX` is an argument of the script to run with DeepSpeed and `--hostfile` is [a file specifying the addresses and the number of devices to use for each node](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) such as: -``` -ip_1 slots=8 -ip_2 slots=8 -... -ip_n slots=8 -``` - -You can find more information about multi-node training in the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/multi_node_training) and in the [`multi-node-training`](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training) folder where a Dockerfile is provided to easily set up your environment. - - -## Loading from a Tensorflow/Flax checkpoint file instead of a PyTorch model - -If a model also has Tensorflow or Flax checkpoints, you can load them instead of a PyTorch checkpoint by specifying `from_tf=True` or `from_flax=True` in the model instantiation. - -You can try it for SQuAD [here](https://github.com/huggingface/optimum-habana/blob/688a857d5308a87a502eec7657f744429125d6f1/examples/question-answering/run_qa.py#L310) or MRPC [here](https://github.com/huggingface/optimum-habana/blob/688a857d5308a87a502eec7657f744429125d6f1/examples/text-classification/run_glue.py#L338). - -You can check if a model has such checkpoints on the [Hub](https://huggingface.co/models). You can also specify a URL or a path to a Tensorflow/Flax checkpoint in `model_args.model_name_or_path`. - -> Resuming from a checkpoint will only work with a PyTorch checkpoint. - - -## Running quick tests - -Most examples are equipped with a mechanism to truncate the number of dataset samples to the desired length. This is useful for debugging purposes, for example to quickly check that all stages of the programs can complete, before running the same setup on the full dataset which may take hours to complete. - -For example here is how to truncate all three splits to just 50 samples each: -``` -examples/pytorch/question-answering/run_squad.py \ ---max_train_samples 50 \ ---max_eval_samples 50 \ ---max_predict_samples 50 \ -[...] -``` - - -## Resuming training - -You can resume training from a previous checkpoint like this: - -1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance). -2. Pass `--resume_from_checkpoint path_to_a_specific_checkpoint` to resume training from that checkpoint folder. - -Should you want to turn an example into a notebook where you'd no longer have access to the command -line, 🤗 GaudiTrainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`. - -1. If `resume_from_checkpoint` is `True` it will look for the last checkpoint in the value of `output_dir` passed via `TrainingArguments`. -2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training. - - -## Uploading the trained/fine-tuned model to the Hub - -All the example scripts support the automatic upload of your final model to the [Model Hub](https://huggingface.co/models) by adding a `--push_to_hub` argument. 
It will then create a repository with your username slash the name of the folder you are using as `output_dir`. For instance, `"sgugger/test-mrpc"` if your username is `sgugger` and you are working in the folder `~/tmp/test-mrpc`. - -To specify a given repository name, use the `--hub_model_id` argument. You will need to specify the whole repository name (including your username), for instance `--hub_model_id sgugger/finetuned-bert-mrpc`. To upload to an organization you are a member of, just use the name of that organization instead of your username: `--hub_model_id huggingface/finetuned-bert-mrpc`. - -A few notes on this integration: - -- you will need to be logged in to the Hugging Face website locally for it to work, the easiest way to achieve this is to run `huggingface-cli login` and then type your username and password when prompted. You can also pass along your authentication token with the `--hub_token` argument. -- the `output_dir` you pick will either need to be a new folder or a local clone of the distant repository you are using. diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md deleted file mode 100644 index 7e91e46eac..0000000000 --- a/examples/audio-classification/README.md +++ /dev/null @@ -1,202 +0,0 @@ - - -# Audio Classification Examples - -The following examples showcase how to fine-tune `Wav2Vec2` for audio classification on Habana Gaudi. - -Speech recognition models that have been pretrained in an unsupervised fashion on audio data alone, *e.g.* [Wav2Vec2](https://huggingface.co/transformers/main/model_doc/wav2vec2.html), have shown to require only very little annotated data to yield good performance on speech classification datasets. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-HPU - -The following command shows how to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the 🗣️ [Keyword Spotting subset](https://huggingface.co/datasets/superb#ks) of the SUPERB dataset on a single HPU. - -```bash -python run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name superb \ - --dataset_config_name ks \ - --output_dir /tmp/wav2vec2-base-ft-keyword-spotting \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --learning_rate 3e-5 \ - --max_length_seconds 1 \ - --attention_mask False \ - --warmup_ratio 0.1 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 256 \ - --per_device_eval_batch_size 256 \ - --dataloader_num_workers 4 \ - --seed 27 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -On a single HPU, this script should run in ~13 minutes and yield an accuracy of **97.96%**. - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -## Multi-HPU - -The following command shows how to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) for 🌎 **Language Identification** on the [CommonLanguage dataset](https://huggingface.co/datasets/anton-l/common_language) on 8 HPUs. 
- -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name common_language \ - --audio_column_name audio \ - --label_column_name language \ - --output_dir /tmp/wav2vec2-base-lang-id \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --learning_rate 3e-4 \ - --max_length_seconds 8 \ - --attention_mask False \ - --warmup_ratio 0.1 \ - --num_train_epochs 10 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 32 \ - --seed 0 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.49%**. - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - -> If you get an error reporting unused parameters in the model, you can specify `--ddp_find_unused_parameters True`. Using this parameter might affect the training speed. - - -## DeepSpeed - -> You need to install DeepSpeed with: -> ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -> ``` - -DeepSpeed can be used with almost the same command as for a multi-card run: -- `use_mpi` should be replaced by `use_deepspeed`, -- an additional `--deepspeed path_to_my_deepspeed config` argument should be provided, for instance `--deepspeed ../../tests/configs/deepspeed_zero_2.json`. - -For example: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name common_language \ - --audio_column_name audio \ - --label_column_name language \ - --output_dir /tmp/wav2vec2-base-lang-id \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --learning_rate 3e-4 \ - --max_length_seconds 8 \ - --attention_mask False \ - --warmup_ratio 0.1 \ - --num_train_epochs 10 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 32 \ - --seed 0 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json -``` - -[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... 
- -For instance, you can run inference with Wav2Vec2 on the Keyword Spotting subset on 1 Gaudi card with the following command: -```bash -python run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name superb \ - --dataset_config_name ks \ - --output_dir /tmp/wav2vec2-base-ft-keyword-spotting \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_eval \ - --max_length_seconds 1 \ - --attention_mask False \ - --per_device_eval_batch_size 256 \ - --dataloader_num_workers 4 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/wav2vec2 \ - --bf16 -``` - - -## Sharing your model on 🤗 Hub - -0. If you haven't already, [sign up](https://huggingface.co/join) for a 🤗 account - -1. Make sure you have `git-lfs` installed and git set up. - -```bash -$ apt install git-lfs -``` - -2. Log in with your HuggingFace account credentials using `huggingface-cli` - -```bash -$ huggingface-cli login -# ...follow the prompts -``` - -3. When running the script, pass the following arguments: - -```bash -python run_audio_classification.py \ - --push_to_hub \ - --hub_model_id \ - ... -``` diff --git a/examples/audio-classification/requirements.txt b/examples/audio-classification/requirements.txt deleted file mode 100644 index 05e54e03f6..0000000000 --- a/examples/audio-classification/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -datasets >= 1.14.0, <= 2.19.2 -evaluate == 0.4.2 -librosa == 0.10.2.post1 diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py deleted file mode 100644 index b8c1e146c9..0000000000 --- a/examples/audio-classification/run_audio_classification.py +++ /dev/null @@ -1,441 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import sys -from dataclasses import dataclass, field -from random import randint -from typing import Optional - -import datasets -import evaluate -import numpy as np -import transformers -from datasets import DatasetDict, load_dataset -from transformers import AutoConfig, AutoFeatureExtractor, AutoModelForAudioClassification, HfArgumentParser -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
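# The checks below require at least Transformers 4.40.0 and Optimum Habana 1.11.0.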
-check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") - - -def random_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 16000): - """Randomly sample chunks of `max_length` seconds from the input audio""" - sample_length = int(round(sample_rate * max_length)) - if len(wav) <= sample_length: - return wav - random_offset = randint(0, len(wav) - sample_length - 1) - return wav[random_offset : random_offset + sample_length] - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - dataset_name: Optional[str] = field(default=None, metadata={"help": "Name of a dataset from the datasets package"}) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "A file containing the training audio paths and labels."} - ) - eval_file: Optional[str] = field( - default=None, metadata={"help": "A file containing the validation audio paths and labels."} - ) - train_split_name: str = field( - default="train", - metadata={ - "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" - }, - ) - eval_split_name: str = field( - default="validation", - metadata={ - "help": ( - "The name of the training data set split to use (via the datasets library). Defaults to 'validation'" - ) - }, - ) - audio_column_name: str = field( - default="audio", - metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, - ) - label_column_name: str = field( - default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"} - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_length_seconds: float = field( - default=20, - metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."}, - ) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - default="facebook/wav2vec2-base", - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from the Hub"} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - feature_extractor_name: Optional[str] = field( - default=None, metadata={"help": "Name or path of preprocessor config."} - ) - freeze_feature_encoder: bool = field( - default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} - ) - attention_mask: bool = field( - default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."} - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - ignore_mismatched_sizes: bool = field( - default=False, - metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, - ) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_audio_classification", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to train from scratch." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Initialize our dataset and prepare it for the audio classification task. - raw_datasets = DatasetDict() - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.train_split_name, - token=model_args.token, - ) - raw_datasets["eval"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.eval_split_name, - token=model_args.token, - ) - - if data_args.audio_column_name not in raw_datasets["train"].column_names: - raise ValueError( - f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--audio_column_name` to the correct audio column - one of " - f"{', '.join(raw_datasets['train'].column_names)}." - ) - - if data_args.label_column_name not in raw_datasets["train"].column_names: - raise ValueError( - f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--label_column_name` to the correct text column - one of " - f"{', '.join(raw_datasets['train'].column_names)}." - ) - - # Setting `return_attention_mask=True` is the way to get a correctly masked mean-pooling over - # transformer outputs in the classifier, but it doesn't always lead to better accuracy - feature_extractor = AutoFeatureExtractor.from_pretrained( - model_args.feature_extractor_name or model_args.model_name_or_path, - return_attention_mask=model_args.attention_mask, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # `datasets` takes care of automatically loading and resampling the audio, - # so we just need to set the correct target sampling rate. 
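    # After this cast, accessing the audio column decodes each file on the fly into a dict with
    # "array", "sampling_rate" and "path", resampled to the feature extractor's sampling rate.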
- raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) - ) - - # Max input length - max_length = int(round(feature_extractor.sampling_rate * data_args.max_length_seconds)) - - model_input_name = feature_extractor.model_input_names[0] - - def train_transforms(batch): - """Apply train_transforms across a batch.""" - subsampled_wavs = [] - - for audio in batch[data_args.audio_column_name]: - wav = random_subsample( - audio["array"], max_length=data_args.max_length_seconds, sample_rate=feature_extractor.sampling_rate - ) - subsampled_wavs.append(wav) - inputs = feature_extractor( - subsampled_wavs, - max_length=max_length, - sampling_rate=feature_extractor.sampling_rate, - padding="max_length", - truncation=True, - ) - output_batch = {model_input_name: inputs.get(model_input_name)} - output_batch["labels"] = list(batch[data_args.label_column_name]) - - return output_batch - - def val_transforms(batch): - """Apply val_transforms across a batch.""" - wavs = [audio["array"] for audio in batch[data_args.audio_column_name]] - inputs = feature_extractor( - wavs, - max_length=max_length, - sampling_rate=feature_extractor.sampling_rate, - padding="max_length", - truncation=True, - ) - output_batch = {model_input_name: inputs.get(model_input_name)} - output_batch["labels"] = list(batch[data_args.label_column_name]) - - return output_batch - - # Prepare label mappings. - # We'll include these in the model's config to get human readable labels in the Inference API. - labels = raw_datasets["train"].features[data_args.label_column_name].names - label2id, id2label = {}, {} - for i, label in enumerate(labels): - label2id[label] = str(i) - id2label[str(i)] = label - - # Load the accuracy metric from the datasets package - metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) - - # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with - # `predictions` and `label_ids` fields) and has to return a dictionary string to float. 
- def compute_metrics(eval_pred): - """Computes accuracy on a batch of predictions""" - predictions = np.argmax(eval_pred.predictions, axis=1) - return metric.compute(predictions=predictions, references=eval_pred.label_ids) - - config = AutoConfig.from_pretrained( - model_args.config_name or model_args.model_name_or_path, - num_labels=len(labels), - label2id=label2id, - id2label=id2label, - finetuning_task="audio-classification", - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForAudioClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) - - # freeze the convolutional waveform encoder if supported by model - if hasattr(model, "freeze_feature_encoder") and model_args.freeze_feature_encoder: - model.freeze_feature_encoder() - - if training_args.do_train: - if data_args.max_train_samples is not None: - raw_datasets["train"] = ( - raw_datasets["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) - ) - # Set the training transforms - raw_datasets["train"].set_transform(train_transforms, output_all_columns=False) - - if training_args.do_eval: - if data_args.max_eval_samples is not None: - raw_datasets["eval"] = ( - raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) - ) - # Set the validation transforms - raw_datasets["eval"].set_transform(val_transforms, output_all_columns=False) - - # Initialize our trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=raw_datasets["train"] if training_args.do_train else None, - eval_dataset=raw_datasets["eval"] if training_args.do_eval else None, - compute_metrics=compute_metrics, - tokenizer=feature_extractor, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Write model card and (optionally) push to hub - kwargs = { - "finetuned_from": model_args.model_name_or_path, - "tasks": "audio-classification", - "dataset": data_args.dataset_name, - "tags": ["audio-classification"], - } - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md deleted file mode 100644 index d19ddcaad8..0000000000 --- a/examples/contrastive-image-text/README.md +++ /dev/null @@ -1,266 +0,0 @@ - - -# VisionTextDualEncoder and CLIP-like model training examples - -This folder contains two examples: - -1. 
The first one showcases how to train a CLIP-like vision-text dual encoder model using pre-trained vision and text encoders. The model is inspired by [CLIP](https://openai.com/blog/clip/), introduced by Alec Radford et al. The idea is to train a vision encoder and a text encoder jointly to project the representation of images and their captions into the same embedding space, such that the caption embeddings are located near the embeddings of the images they describe. -2. The second one showcases how to train a [BridgeTower](https://arxiv.org/abs/2206.08657) model. This model contains bridges between the text and vision encoders that are linked to a cross-modal encoder. This enables effective bottom-up cross-modal alignment between visual and textual representations at different semantic levels in the cross-modal encoder. - -Such models can be used for natural language image search and potentially zero-shot image classification. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Download COCO dataset (2017) -This example uses COCO dataset (2017) through a custom dataset script, which requires users to manually download the -COCO dataset before training. - -```bash -mkdir data -cd data -wget http://images.cocodataset.org/zips/train2017.zip -wget http://images.cocodataset.org/zips/val2017.zip -wget http://images.cocodataset.org/zips/test2017.zip -wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -wget http://images.cocodataset.org/annotations/image_info_test2017.zip -cd .. -``` - -Having downloaded COCO dataset manually you should be able to load with the `ydshieh/coco_dataset_script` dataset loading script: - -```py -import os -import datasets - -COCO_DIR = os.path.join(os.getcwd(), "data") -ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_DIR) -``` - -## CLIP-like models - -Here is how to run the `run_clip.py` script for training CLIP-like models. - - -### Create a model from a vision encoder model and a text encoder model -Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder). -The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder. -Here is an example of how to load the model using pre-trained vision and text models. - -```python3 -from transformers import ( - VisionTextDualEncoderModel, - VisionTextDualEncoderProcessor, - AutoTokenizer, - AutoImageProcessor -) - -model = VisionTextDualEncoderModel.from_vision_text_pretrained( - "openai/clip-vit-base-patch32", "roberta-base" -) - -tokenizer = AutoTokenizer.from_pretrained("roberta-base") -image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32") -processor = VisionTextDualEncoderProcessor(image_processor, tokenizer) - -# save the model and processor -model.save_pretrained("clip-roberta") -processor.save_pretrained("clip-roberta") -``` - -This loads both the text and vision encoders using pre-trained weights, the projection layers are randomly -initialized except for CLIP's vision model. If you use CLIP to initialize the vision model then the vision projection weights are also -loaded using the pre-trained weights. - -### Single-HPU training - -Finally, we can run the example script to train the model. 
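Before launching training, you can optionally check that the exported `clip-roberta` folder loads and runs. This is only a sketch: the image path is a placeholder, and since the text projection is randomly initialized the similarity scores are not meaningful yet.

```python
import torch
from PIL import Image
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor

# Reload the model and processor saved above
model = VisionTextDualEncoderModel.from_pretrained("clip-roberta")
processor = VisionTextDualEncoderProcessor.from_pretrained("clip-roberta")

image = Image.open("data/val2017/000000039769.jpg")  # placeholder: any local COCO image
inputs = processor(
    text=["two cats sleeping on a couch", "a plate of food on a table"],
    images=[image],
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    outputs = model(**inputs)

# Image-to-text similarity logits (not meaningful before fine-tuning)
print(outputs.logits_per_image)
```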
-Run the following command for single-device training: - -```bash -python run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --per_device_train_batch_size="512" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --save_strategy epoch \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 16 \ - --bf16 -``` - - -### Multi-HPU training - -Run the following command for distributed training: - -```bash -python ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --per_device_train_batch_size="512" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --save_strategy epoch \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 16 \ - --mediapipe_dataloader \ - --use_hpu_graphs_for_training \ - --bf16 \ - --distribution_strategy fast_ddp -``` - -> `--mediapipe_dataloader` only works on Gaudi2. - - -### DeepSpeed - -Run the following command for training with DeepSpeed: - -```bash -python ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --per_device_train_batch_size="512" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --save_strategy epoch \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - - -## BridgeTower - -For training BridgeTower, you need to run the `run_bridgetower.py` script. 
-For instance, to reproduce the results presented in [this blog post](https://huggingface.co/blog/bridgetower), you should run: - -```bash -python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \ - --output_dir /tmp/bridgetower-test \ - --model_name_or_path BridgeTower/bridgetower-large-itm-mlm-itc \ - --dataset_name jmhessel/newyorker_caption_contest --dataset_config_name matching \ - --dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6 \ - --image_column image --caption_column image_description \ - --remove_unused_columns=False \ - --do_train --do_eval --do_predict \ - --per_device_train_batch_size="40" --per_device_eval_batch_size="16" \ - --num_train_epochs 5 \ - --learning_rate="1e-5" \ - --overwrite_output_dir \ - --save_strategy no \ - --use_habana --use_lazy_mode --use_hpu_graphs_for_inference --gaudi_config_name Habana/clip \ - --throughput_warmup_steps 3 \ - --logging_steps 10 \ - --dataloader_num_workers 1 \ - --mediapipe_dataloader \ - --distribution_strategy fast_ddp -``` - -> `--mediapipe_dataloader` only works on Gaudi2. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with CLIP on COCO on 1 Gaudi card with the following command: -```bash -python run_clip.py \ - --output_dir ./clip-roberta-finetuned \ - --model_name_or_path ./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name=2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_eval \ - --per_device_eval_batch_size="64" \ - --overwrite_output_dir \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/clip \ - --bf16 \ - --mediapipe_dataloader -``` - -> `--mediapipe_dataloader` only works on Gaudi2. diff --git a/examples/contrastive-image-text/clip_media_pipe.py b/examples/contrastive-image-text/clip_media_pipe.py deleted file mode 100755 index be2fa4a419..0000000000 --- a/examples/contrastive-image-text/clip_media_pipe.py +++ /dev/null @@ -1,186 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
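# Overview: this module builds a Habana MediaPipe pipeline for CLIP-style training.
# A custom reader streams image paths and pre-tokenized captions from the dataset,
# images are decoded and resized on HPU, then crop/mirror/normalize is applied before
# the batches are consumed by the dataloader defined in clip_mediapipe_dataloader.py.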
- -import numpy as np -from torch.utils.data.sampler import BatchSampler - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -try: - from habana_frameworks.mediapipe import fn - from habana_frameworks.mediapipe.media_types import dtype, ftype, imgtype, randomCropType, readerOutType - from habana_frameworks.mediapipe.mediapipe import MediaPipe - from habana_frameworks.mediapipe.operators.reader_nodes.read_image_from_dir import get_max_file - from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import ( - media_ext_reader_op_impl, - media_ext_reader_op_tensor_info, - ) -except ImportError: - pass - -read_image_text_from_dataset_params = { - "label_dtype": dtype.UINT32, - "dataset": None, -} - - -class read_image_text_from_dataset(media_ext_reader_op_impl): - """ - Class defining read image/text from clip dataset. - - """ - - def __init__(self, params, fw_params): - self.batch_size = 1 - params = params["priv_params"] - self.meta_dtype = params["label_dtype"] - self.dataset = params["dataset"] - self.epoch = 0 - self.batch_sampler_iter = None - self.iter_loc = 0 - self.num_imgs_slice = len(ClipMediaPipe.batch_sampler.sampler) - self.num_batches_slice = len(ClipMediaPipe.batch_sampler) - - logger.info("Finding largest file ...") - if "image_path" in self.dataset.column_names: - self.max_file = get_max_file(self.dataset["image_path"]) - else: - self.max_file = get_max_file([img["path"] for img in self.dataset["image"]]) - logger.info(f"The largest file is {self.max_file}.") - self.batch_size = fw_params.batch_size - - def gen_output_info(self): - out_info = [] - o = media_ext_reader_op_tensor_info(dtype.NDT, np.array([self.batch_size], dtype=np.uint32), "") - out_info.append(o) - o = media_ext_reader_op_tensor_info( - self.meta_dtype, np.array([self.dataset.text_max_length, self.batch_size], dtype=np.uint32), "" - ) - out_info.append(o) - o = media_ext_reader_op_tensor_info( - self.meta_dtype, np.array([self.dataset.text_max_length, self.batch_size], dtype=np.uint32), "" - ) - out_info.append(o) - return out_info - - def get_largest_file(self): - return self.max_file - - def get_media_output_type(self): - return readerOutType.FILE_LIST - - def __len__(self): - return self.num_batches_slice - - def __iter__(self): - self.iter_loc = 0 - self.batch_sampler_iter = iter(ClipMediaPipe.batch_sampler) - self.epoch += 1 - return self - - def __next__(self): - if self.iter_loc > (self.num_imgs_slice - 1): - raise StopIteration - - data_idx = next(self.batch_sampler_iter) - data = self.dataset.__getitems__(data_idx) - img_list = [] - - input_id_list = np.zeros(shape=(self.batch_size, self.dataset.text_max_length), dtype=self.meta_dtype) - attention_mask_list = np.zeros(shape=(self.batch_size, self.dataset.text_max_length), dtype=self.meta_dtype) - for i, x in enumerate(data): - if "image_path" in self.dataset.column_names: - img_list.append(x["image_path"]) - else: - img_list.append(x["image"]["path"]) - input_id_list[i, :] = x["input_ids"] - attention_mask_list[i, :] = x["attention_mask"] - - self.iter_loc = self.iter_loc + self.batch_size - - return img_list, input_id_list, attention_mask_list - - -class ClipMediaPipe(MediaPipe): - """ - Class defining clip media pipe: - read data --> image decoding (include crop and resize) --> crop mirror normalize - - Original set of PyTorch transformations: - aspect ratio preserving resize -> center crop -> normalize - - """ - - batch_sampler = None - instance_count = 0 - - def __init__(self, dataset=None, 
sampler=None, batch_size=512, drop_last=False, queue_depth=1): - self.device = "legacy" - self.dataset = dataset - self.drop_last = drop_last - self.sampler = sampler - ClipMediaPipe.batch_sampler = BatchSampler(sampler, batch_size, drop_last) - self.image_size = self.dataset.image_resize - - pipe_name = "{}:{}".format(self.__class__.__name__, ClipMediaPipe.instance_count) - pipe_name = str(pipe_name) - - super(ClipMediaPipe, self).__init__( - device=self.device, batch_size=batch_size, prefetch_depth=queue_depth, pipe_name=pipe_name - ) - params = read_image_text_from_dataset_params.copy() - params["dataset"] = self.dataset - self.input = fn.MediaExtReaderOp( - impl=read_image_text_from_dataset, - num_outputs=3, - priv_params=params, - ) - def_output_image_size = [self.image_size, self.image_size] - res_pp_filter = ftype.BICUBIC - self.decode = fn.ImageDecoder( - device="hpu", - output_format=imgtype.RGB_P, - random_crop_type=randomCropType.CENTER_CROP, - resize=def_output_image_size, - resampling_mode=res_pp_filter, - ) - - cmn_pos_offset = 0.5 - normalize_mean = np.array(self.dataset.image_mean).astype(np.float32) * 255 - normalize_std = 1 / (np.array(self.dataset.image_std).astype(np.float32) * 255) - norm_mean = fn.MediaConst(data=normalize_mean, shape=[1, 1, 3], dtype=dtype.FLOAT32) - norm_std = fn.MediaConst(data=normalize_std, shape=[1, 1, 3], dtype=dtype.FLOAT32) - self.cmn = fn.CropMirrorNorm( - crop_w=self.image_size, - crop_h=self.image_size, - crop_pos_x=cmn_pos_offset, - crop_pos_y=cmn_pos_offset, - crop_d=0, - dtype=dtype.FLOAT32, - ) - self.mean = norm_mean() - self.std = norm_std() - - ClipMediaPipe.instance_count += 1 - - def definegraph(self): - jpegs, input_ids, attention_masks = self.input() - images = self.decode(jpegs) - images = self.cmn(images, self.mean, self.std) - return images, input_ids, attention_masks diff --git a/examples/contrastive-image-text/clip_mediapipe_dataloader.py b/examples/contrastive-image-text/clip_mediapipe_dataloader.py deleted file mode 100644 index 1a5958a519..0000000000 --- a/examples/contrastive-image-text/clip_mediapipe_dataloader.py +++ /dev/null @@ -1,90 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
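# Overview: this module wraps the Habana media pipe in a torch DataLoader-compatible class.
# It tries to build a ClipMediaPipe-backed iterator and, if that fails (for example when the
# mediapipe packages are not available), falls back to the native PyTorch dataloader.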
- -import torch - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -class MediaApiDataLoader(torch.utils.data.DataLoader): - def __init__( - self, - dataset, - batch_size=1, - sampler=None, - collate_fn=None, - drop_last=False, - num_workers=0, - pin_memory=False, - worker_init_fn=None, - ): - self.dataset = dataset - self.sampler = sampler - self.fallback_activated = False - - try: - from clip_media_pipe import ClipMediaPipe - from habana_frameworks.mediapipe.plugins.iterator_pytorch import HPUGenericPytorchIterator - - pipeline = ClipMediaPipe( - dataset=dataset, - sampler=sampler, - batch_size=batch_size, - drop_last=drop_last, - queue_depth=3, - ) - self.iterator = HPUGenericPytorchIterator(mediapipe=pipeline) - except Exception as e: - logger.warning(f"Using Pytorch native dataloader because: {e}.") - self.fallback_activated = True - dataset.set_transform(dataset.transform_func) - super(MediaApiDataLoader, self).__init__( - dataset, - batch_size=batch_size, - sampler=sampler, - collate_fn=collate_fn, - drop_last=drop_last, - num_workers=num_workers, - pin_memory=pin_memory, - worker_init_fn=worker_init_fn, - ) - - def __len__(self): - if self.fallback_activated: - return super().__len__() - else: - return len(self.iterator) - - def __iter__(self): - if self.fallback_activated: - return super().__iter__() - else: - self.iterator.__iter__() - return self - - def __next__(self): - if self.fallback_activated: - return super().__next__() - - data = next(self.iterator) - return { - "pixel_values": data[0], - "input_ids": data[1], - "attention_mask": data[2], - "return_loss": True, - } diff --git a/examples/contrastive-image-text/habana_dataloader_trainer.py b/examples/contrastive-image-text/habana_dataloader_trainer.py deleted file mode 100644 index b0c0fa3edf..0000000000 --- a/examples/contrastive-image-text/habana_dataloader_trainer.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A subclass of `GaudiTrainer` specific to habana dataloader -""" - -from typing import Optional - -import datasets -import torch -from clip_mediapipe_dataloader import MediaApiDataLoader -from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler -from transformers.trainer_pt_utils import ( - DistributedLengthGroupedSampler, - DistributedSampler, - DistributedSamplerWithLoop, - LengthGroupedSampler, - ShardSampler, -) -from transformers.trainer_utils import has_length, seed_worker -from transformers.utils import is_datasets_available - -from optimum.habana import GaudiTrainer - - -class HabanaDataloaderTrainer(GaudiTrainer): - def get_train_dataloader(self) -> DataLoader: - """ - Returns the training Habana Media Dataloader. 
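        Falls back to a native PyTorch dataloader if the Habana media pipe cannot be built
        (see `MediaApiDataLoader` in clip_mediapipe_dataloader.py).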
- """ - if self.train_dataset is None: - raise ValueError("Trainer: training requires a train_dataset.") - - train_dataset = self.train_dataset - data_collator = self.data_collator - - if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): - train_dataset = self._remove_unused_columns(train_dataset, description="training") - else: - data_collator = self._get_collator_with_removed_columns(data_collator, description="training") - - dataloader_params = { - "batch_size": self._train_batch_size, - "collate_fn": data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - } - - if not isinstance(train_dataset, torch.utils.data.IterableDataset): - dataloader_params["sampler"] = self._get_train_sampler() - dataloader_params["drop_last"] = self.args.dataloader_drop_last - dataloader_params["worker_init_fn"] = seed_worker - - return MediaApiDataLoader(train_dataset, **dataloader_params) - - def get_eval_dataloader(self, eval_dataset: Optional[datasets.Dataset] = None) -> DataLoader: - """ - Returns the eval Habana Media Dataloader. - """ - if eval_dataset is None and self.eval_dataset is None: - raise ValueError("Trainer: evaluation requires an eval_dataset.") - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - data_collator = self.data_collator - - if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): - eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") - else: - data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation") - - dataloader_params = { - "batch_size": self.args.eval_batch_size, - "collate_fn": data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - } - - if not isinstance(eval_dataset, torch.utils.data.IterableDataset): - dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) - dataloader_params["drop_last"] = True - - return MediaApiDataLoader(eval_dataset, **dataloader_params) - - def get_test_dataloader(self, test_dataset: datasets.Dataset) -> DataLoader: - """ - Returns the test Habana Media Dataloader. - """ - data_collator = self.data_collator - - if is_datasets_available() and isinstance(test_dataset, datasets.Dataset): - test_dataset = self._remove_unused_columns(test_dataset, description="test") - else: - data_collator = self._get_collator_with_removed_columns(data_collator, description="test") - - dataloader_params = { - "batch_size": self.args.eval_batch_size, - "collate_fn": data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - } - - if not isinstance(test_dataset, torch.utils.data.IterableDataset): - dataloader_params["sampler"] = self._get_eval_sampler(test_dataset) - dataloader_params["drop_last"] = True - - # We use the same batch_size as for eval. - return MediaApiDataLoader(test_dataset, **dataloader_params) - - def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: - """ - Copied from: https://github.com/huggingface/optimum-habana/blob/v1.6.1/optimum/habana/transformers/trainer.py#L257 - `_get_train_sampler` from Transformers v4.31 does not work with distributed runs using the media pipe. - Probably because a `DistributedSampler` is not used there. 
- """ - if self.train_dataset is None or not has_length(self.train_dataset): - return None - - generator = None - if self.args.world_size <= 1: - generator = torch.Generator() - # for backwards compatibility, we generate a seed here (which is sampled from a generator seeded with - # `args.seed`) if data_seed isn't provided. - # Further on in this method, we default to `args.seed` instead. - if self.args.data_seed is None: - seed = int(torch.empty((), dtype=torch.int64).random_().item()) - else: - seed = self.args.data_seed - generator.manual_seed(seed) - - seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed - - # Build the sampler. - if self.args.group_by_length: - if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset): - lengths = ( - self.train_dataset[self.args.length_column_name] - if self.args.length_column_name in self.train_dataset.column_names - else None - ) - else: - lengths = None - model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None - if self.args.world_size <= 1: - return LengthGroupedSampler( - self.args.train_batch_size * self.args.gradient_accumulation_steps, - dataset=self.train_dataset, - lengths=lengths, - model_input_name=model_input_name, - generator=generator, - ) - else: - return DistributedLengthGroupedSampler( - self.args.train_batch_size * self.args.gradient_accumulation_steps, - dataset=self.train_dataset, - num_replicas=self.args.world_size, - rank=self.args.process_index, - lengths=lengths, - model_input_name=model_input_name, - seed=seed, - ) - - else: - if self.args.world_size <= 1: - num_samples = len(self.train_dataset) - if ( - self.args.use_lazy_mode - and not self.args.dataloader_drop_last - and len(self.train_dataset) % self.args.per_device_train_batch_size != 0 - ): - # Make the total number of samples divisible by the batch size in lazy mode if needed - num_samples += ( - self.args.per_device_train_batch_size - - len(self.train_dataset) % self.args.per_device_train_batch_size - ) - return RandomSampler(self.train_dataset, num_samples=num_samples, generator=generator) - else: - if self.args.use_lazy_mode and not self.args.dataloader_drop_last: - # Use a loop for HPUs when drop_last is False to have all batches have the same size - return DistributedSamplerWithLoop( - self.train_dataset, - batch_size=self.args.per_device_train_batch_size, - num_replicas=self.args.world_size, - rank=self.args.process_index, - seed=seed, - ) - else: - return DistributedSampler( - self.train_dataset, - num_replicas=self.args.world_size, - rank=self.args.process_index, - seed=seed, - ) - - def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]: - """ - Copied from; https://github.com/huggingface/transformers/blob/v4.28.1/src/transformers/trainer.py#L918 - `_get_eval_sampler` from Transformers v4.31 may return `None` which breaks the media pipe. 
- """ - if self.args.world_size <= 1: - return SequentialSampler(eval_dataset) - else: - return ShardSampler( - eval_dataset, - batch_size=self.args.per_device_eval_batch_size, - num_processes=self.args.world_size, - process_index=self.args.process_index, - ) diff --git a/examples/contrastive-image-text/requirements.txt b/examples/contrastive-image-text/requirements.txt deleted file mode 100644 index d1fff8c979..0000000000 --- a/examples/contrastive-image-text/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -datasets >= 1.8.0, <= 2.19.2 diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py deleted file mode 100644 index c8932c8fbf..0000000000 --- a/examples/contrastive-image-text/run_bridgetower.py +++ /dev/null @@ -1,637 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Training BridgeTower with a contrastive text-image loss. -""" - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -import torch -import transformers -from datasets import load_dataset -from habana_dataloader_trainer import HabanaDataloaderTrainer -from torchvision.io import ImageReadMode, read_image -from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize -from torchvision.transforms.functional import InterpolationMode, to_grayscale, to_tensor -from transformers import ( - AutoImageProcessor, - AutoTokenizer, - HfArgumentParser, -) -from transformers.models.bridgetower.modeling_bridgetower import BridgeTowerForContrastiveLearning -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed -from PIL import Image -import io - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - freeze_vision_model: bool = field( - default=False, metadata={"help": "Whether to freeze the vision model parameters or not."} - ) - freeze_text_model: bool = field( - default=False, metadata={"help": "Whether to freeze the text model parameters or not."} - ) - freeze_text_pooler: bool = field( - default=True, metadata={"help": "Whether to freeze the text pooler parameters or not."} - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - dataset_revision: str = field( - default="main", - metadata={"help": "The specific dataset version to use (can be a branch name, tag name or commit id)."}, - ) - data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."}) - image_column: Optional[str] = field( - default="image_path", - metadata={"help": "The name of the column in the datasets containing the full image file paths."}, - ) - caption_column: Optional[str] = field( - default="caption", - metadata={"help": "The name of the column in the datasets containing the image captions."}, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a jsonlines file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file (a jsonlines file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input testing data file (a jsonlines file)."}, - ) - max_seq_length: Optional[int] = field( - default=128, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mediapipe_dataloader: bool = field( - default=False, metadata={"help": "Turn on MediaPipe hardware-based accelerated data loading."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." - - -dataset_name_mapping = { - "image_caption_dataset.py": ("image_path", "caption"), -} - - -# We use torchvision for faster image pre-processing. The transforms are implemented as nn.Module, -# so we jit it to be faster. 
-class Transform(torch.nn.Module): - def __init__(self, image_size, mean, std): - super().__init__() - self.transforms = torch.nn.Sequential( - Resize([image_size], interpolation=InterpolationMode.BICUBIC), - CenterCrop(image_size), - ConvertImageDtype(torch.float), - Normalize(mean, std), - ) - - def forward(self, x) -> torch.Tensor: - """`x` should be an instance of `PIL.Image.Image`""" - with torch.no_grad(): - x = self.transforms(x) - return x - - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long) - attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long) - return { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "return_loss": True, - } - - -def main(): - # 1. Parse input arguments - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_bridgetower", model_args, data_args) - - # 2. Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # 3. 
Detecting last checkpoint and eventually continue from last checkpoint - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # 4. Load dataset - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files this script will use the first column for the full image path and the second column for the - # captions (unless you specify column names for this with the `image_column` and `caption_column` arguments). - # - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - keep_in_memory=False, - data_dir=data_args.data_dir, - token=model_args.token, - revision=data_args.dataset_revision, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - dataset = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - if data_args.mediapipe_dataloader and "image_path" not in dataset["train"].column_names: - dataset = dataset.cast_column(data_args.image_column, datasets.Image(decode=False)) - - # 5. Load pretrained model, tokenizer, and image processor - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script. " - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - # Load image_processor, in this script we only use this to get the mean and std for normalization. 
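Besides datasets from the Hub, the loading branch above accepts local CSV/JSON files through `--train_file`/`--validation_file`, with the columns named by `--image_column` and `--caption_column` (defaults: `image_path` and `caption`). A hedged sketch of a compatible jsonlines file; file names and contents are hypothetical:

```python
# Hedged sketch: building a local jsonlines file usable with --train_file.
# One JSON object per line, with an image path column and a caption column.
import json

records = [
    {"image_path": "images/cat.jpg", "caption": "a cat sleeping on a sofa"},
    {"image_path": "images/dog.jpg", "caption": "a dog running on the beach"},
]
with open("train.json", "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# The script then calls load_dataset("json", data_files={"train": "train.json"}, ...) internally.
```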
- image_processor = AutoImageProcessor.from_pretrained( - model_args.image_processor_name or model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - model = BridgeTowerForContrastiveLearning.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - config = model.config - - def _freeze_params(module): - for param in module.parameters(): - param.requires_grad = False - - if model_args.freeze_text_pooler: - model.bridgetower.text_model.pooler = None - - if model_args.freeze_vision_model: - _freeze_params(model.vision_model) - - if model_args.freeze_text_model: - _freeze_params(model.text_model) - - # set seed for torch dataloaders - set_seed(training_args.seed) - - # Create validation and test splits if they don't exist yet - if "test" not in dataset: - dataset = dataset["train"].train_test_split(test_size=0.4, shuffle=True, seed=training_args.seed) - if "validation" not in dataset: - buffer_dataset = dataset["test"].train_test_split(test_size=0.5, shuffle=False) - dataset["validation"] = buffer_dataset["train"] - dataset["test"] = buffer_dataset["test"] - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - if training_args.do_train: - column_names = dataset["train"].column_names - elif training_args.do_eval: - column_names = dataset["validation"].column_names - elif training_args.do_predict: - column_names = dataset["test"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") - return - - # 6. Get the column names for input/target. - dataset_columns = dataset_name_mapping.get(data_args.dataset_name, None) - if data_args.image_column is None: - image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - image_column = data_args.image_column - if image_column not in column_names: - raise ValueError( - f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}" - ) - if data_args.caption_column is None: - caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - caption_column = data_args.caption_column - if caption_column not in column_names: - raise ValueError( - f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}" - ) - - # 7. Preprocessing the datasets. - # Initialize torchvision transforms and jit it for faster processing. - image_transformations = Transform( - config.vision_config.image_size, image_processor.image_mean, image_processor.image_std - ) - - # Preprocessing the datasets. - # We need to tokenize input captions and transform the images. 
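Before the tokenization and image transforms defined next, note how the split creation above works: when the dataset ships with only a `train` split, 40% is split off and then halved, which yields a 60/20/20 train/validation/test partition. A small worked sketch with illustrative sizes:

```python
# Hedged sketch of the split logic above, on a toy train-only dataset of 10,000 rows.
from datasets import Dataset, DatasetDict

dataset = DatasetDict({"train": Dataset.from_dict({"caption": [f"caption {i}" for i in range(10_000)]})})

dataset = dataset["train"].train_test_split(test_size=0.4, shuffle=True, seed=42)
buffer_dataset = dataset["test"].train_test_split(test_size=0.5, shuffle=False)
dataset["validation"] = buffer_dataset["train"]
dataset["test"] = buffer_dataset["test"]

print({split: len(ds) for split, ds in dataset.items()})
# {'train': 6000, 'test': 2000, 'validation': 2000}
```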
- def tokenize_captions(examples): - captions = list(examples[caption_column]) - text_inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True) - examples["input_ids"] = text_inputs.input_ids - examples["attention_mask"] = text_inputs.attention_mask - return examples - - def get_image(image_or_path): - if isinstance(image_or_path, str): - # If the argument is a path to an image file, read it - return read_image(image_or_path, mode=ImageReadMode.RGB) - elif isinstance(image_or_path, Image.Image): - if len(image_or_path.getbands()) == 1: - image_or_path = to_grayscale(image_or_path, num_output_channels=3) - return to_tensor(image_or_path) - - return None - - def transform_images(examples): - images = [] - - for item in examples[image_column]: - # Manage the case where images are a dictionary with keys 'bytes' and 'path' - if isinstance(item, dict): - encoding = 'ISO-8859-1' - s = item['bytes'].decode(encoding) - b = bytearray(s, encoding) - image = Image.open(io.BytesIO(b)).convert("RGB") - images.append(to_tensor(image)) - else: - images.append(get_image(item)) - - examples["pixel_values"] = [image_transformations(image) for image in images] - - return examples - - if training_args.do_train: - if "train" not in dataset: - raise ValueError("--do_train requires a train dataset") - train_dataset = dataset["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - train_dataset = train_dataset.map( - function=tokenize_captions, - batched=True, - remove_columns=[col for col in column_names if col != image_column], - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - - if data_args.mediapipe_dataloader: - train_dataset.image_mean = image_processor.image_mean - train_dataset.image_std = image_processor.image_std - train_dataset.text_max_length = data_args.max_seq_length - train_dataset.image_resize = config.vision_config.image_size - train_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. - train_dataset.set_transform(transform_images) - - if training_args.do_eval: - if "validation" not in dataset: - raise ValueError("--do_eval requires a train validation") - eval_dataset = dataset["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - eval_dataset = eval_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - - if data_args.mediapipe_dataloader: - eval_dataset.image_mean = image_processor.image_mean - eval_dataset.image_std = image_processor.image_std - eval_dataset.text_max_length = data_args.max_seq_length - eval_dataset.image_resize = config.vision_config.image_size - eval_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. 
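In `transform_images` above, images delivered by `datasets.Image(decode=False)` arrive as a dict with `bytes` and `path` keys, and the ISO-8859-1 decode/re-encode round trip is byte-for-byte lossless. A hedged sketch of a more direct equivalent (my simplification, not code from the example):

```python
# Hedged sketch: the raw bytes can be fed straight to PIL, with the same result as
# the decode/encode round trip used above.
import io

from PIL import Image
from torchvision.transforms.functional import to_tensor


def image_from_item(item):
    """Return a CHW tensor from either a path string or a {'bytes', 'path'} dict."""
    if isinstance(item, dict):
        image = Image.open(io.BytesIO(item["bytes"])).convert("RGB")
        return to_tensor(image)
    return get_image(item)  # falls back to the path-based helper defined above
```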
- eval_dataset.set_transform(transform_images) - - if training_args.do_predict: - if "test" not in dataset: - raise ValueError("--do_predict requires a test dataset") - test_dataset = dataset["test"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(test_dataset), data_args.max_eval_samples) - test_dataset = test_dataset.select(range(max_eval_samples)) - - test_dataset = test_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on test dataset", - ) - - if data_args.mediapipe_dataloader: - test_dataset.image_mean = image_processor.image_mean - test_dataset.image_std = image_processor.image_std - test_dataset.text_max_length = data_args.max_seq_length - test_dataset.image_resize = config.vision_config.image_size - test_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. - test_dataset.set_transform(transform_images) - - # 8. Initialize our trainer - trainer_cls = HabanaDataloaderTrainer if data_args.mediapipe_dataloader else GaudiTrainer - trainer = trainer_cls( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - data_collator=collate_fn, - ) - - # 9. Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - tokenizer.save_pretrained(training_args.output_dir) - image_processor.save_pretrained(training_args.output_dir) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # 10. Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("validation", metrics) - trainer.save_metrics("validation", metrics) - - # 11. Test - if training_args.do_predict: - metrics = trainer.evaluate(eval_dataset=test_dataset) - trainer.log_metrics("test", metrics) - trainer.save_metrics("test", metrics) - - # 12. Write Training Stats and push to hub. - finetuned_from = model_args.model_name_or_path - # If from a local directory, don't set `finetuned_from` as this is required to be a valid repo. id on the Hub. 
- if os.path.isdir(finetuned_from): - finetuned_from = None - kwargs = {"finetuned_from": finetuned_from, "tasks": "contrastive-image-text-modeling"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py deleted file mode 100644 index 8d3b3a28a5..0000000000 --- a/examples/contrastive-image-text/run_clip.py +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Training a CLIP like dual encoder models using text and vision encoders in the library. -The script can be used to train CLIP like models for languages other than English by using -a text encoder pre-trained in the desired language. Currently this script supports the following vision -and text models: -Vision models: ViT(https://huggingface.co/models?filter=vit), CLIP (https://huggingface.co/models?filter=clip) -Text models: BERT, ROBERTa (https://huggingface.co/models?filter=fill-mask) -""" - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -import transformers -from datasets import load_dataset -from habana_dataloader_trainer import HabanaDataloaderTrainer -from PIL import Image -from torchvision.io import ImageReadMode, read_image -from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize -from torchvision.transforms.functional import InterpolationMode -from transformers import ( - AutoImageProcessor, - AutoModel, - AutoTokenizer, - HfArgumentParser, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
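`run_clip.py` loads its model with `AutoModel`, so it expects a checkpoint that already couples a vision encoder with a text encoder. As a hedged sketch (this step is not part of the script, and the checkpoint names are only examples), such a dual-encoder checkpoint can be assembled beforehand with Transformers' `VisionTextDualEncoderModel`:

```python
# Hedged sketch: building a CLIP-like vision+text checkpoint that can then be passed
# to run_clip.py via --model_name_or_path. Checkpoint names are illustrative.
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
)

model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "google/vit-base-patch16-224-in21k", "roberta-base"
)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)

model.save_pretrained("clip-roberta")
processor.save_pretrained("clip-roberta")
```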
-check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - freeze_vision_model: bool = field( - default=False, metadata={"help": "Whether to freeze the vision model parameters or not."} - ) - freeze_text_model: bool = field( - default=False, metadata={"help": "Whether to freeze the text model parameters or not."} - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."}) - image_column: Optional[str] = field( - default="image_path", - metadata={"help": "The name of the column in the datasets containing the full image file paths."}, - ) - caption_column: Optional[str] = field( - default="caption", - metadata={"help": "The name of the column in the datasets containing the image captions."}, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a jsonlines file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file (a jsonlines file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input testing data file (a jsonlines file)."}, - ) - max_seq_length: Optional[int] = field( - default=128, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mediapipe_dataloader: bool = field( - default=False, metadata={"help": "Turn on MediaPipe hardware-based accelerated data loading."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." - - -dataset_name_mapping = { - "image_caption_dataset.py": ("image_path", "caption"), -} - - -# We use torchvision for faster image pre-processing. The transforms are implemented as nn.Module, -# so we jit it to be faster. 
-class Transform(torch.nn.Module): - def __init__(self, image_size, mean, std): - super().__init__() - self.transforms = torch.nn.Sequential( - Resize([image_size], interpolation=InterpolationMode.BICUBIC), - CenterCrop(image_size), - ConvertImageDtype(torch.float), - Normalize(mean, std), - ) - - def forward(self, x) -> torch.Tensor: - """`x` should be an instance of `PIL.Image.Image`""" - with torch.no_grad(): - x = self.transforms(x) - return x - - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long) - attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long) - return { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "return_loss": True, - } - - -def main(): - # 1. Parse input arguments - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clip", model_args, data_args) - - # 2. Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # 3. 
Detecting last checkpoint and eventually continue from last checkpoint - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # 4. Load dataset - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files this script will use the first column for the full image path and the second column for the - # captions (unless you specify column names for this with the `image_column` and `caption_column` arguments). - # - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - keep_in_memory=False, - data_dir=data_args.data_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - dataset = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - # 5. Load pretrained model, tokenizer, and image processor - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script. " - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - # Load image_processor, in this script we only use this to get the mean and std for normalization. 
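As the comment above says, the image processor is only queried for its normalization statistics: they feed torchvision's `Normalize` in this script, while the media-pipe variant in `clip_media_pipe.py` rescales the same values by 255 because it normalizes uint8 images. A hedged sketch of reading them (the checkpoint name is an example):

```python
# Hedged sketch: inspecting the per-channel normalization statistics of an image processor.
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
print(image_processor.image_mean)  # three floats, one per RGB channel
print(image_processor.image_std)   # three floats, one per RGB channel
```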
- image_processor = AutoImageProcessor.from_pretrained( - model_args.image_processor_name or model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - model = AutoModel.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - config = model.config - - def _freeze_params(module): - for param in module.parameters(): - param.requires_grad = False - - if model_args.freeze_vision_model: - _freeze_params(model.vision_model) - - if model_args.freeze_text_model: - _freeze_params(model.text_model) - - # set seed for torch dataloaders - set_seed(training_args.seed) - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - if training_args.do_train: - column_names = dataset["train"].column_names - elif training_args.do_eval: - column_names = dataset["validation"].column_names - elif training_args.do_predict: - column_names = dataset["test"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") - return - - # 6. Get the column names for input/target. - dataset_columns = dataset_name_mapping.get(data_args.dataset_name, None) - if data_args.image_column is None: - image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - image_column = data_args.image_column - if image_column not in column_names: - raise ValueError( - f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}" - ) - if data_args.caption_column is None: - caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - caption_column = data_args.caption_column - if caption_column not in column_names: - raise ValueError( - f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}" - ) - - # 7. Preprocessing the datasets. - # Initialize torchvision transforms and jit it for faster processing. - image_transformations = Transform( - config.vision_config.image_size, image_processor.image_mean, image_processor.image_std - ) - - # Preprocessing the datasets. - # We need to tokenize input captions and transform the images. 
- def tokenize_captions(examples): - captions = list(examples[caption_column]) - text_inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True) - examples["input_ids"] = text_inputs.input_ids - examples["attention_mask"] = text_inputs.attention_mask - return examples - - def transform_images(examples): - images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples[image_column]] - examples["pixel_values"] = [image_transformations(image) for image in images] - return examples - - def filter_corrupt_images(examples): - """remove problematic images""" - valid_images = [] - for image_file in examples[image_column]: - try: - Image.open(image_file) - valid_images.append(True) - except Exception: - valid_images.append(False) - return valid_images - - if training_args.do_train: - if "train" not in dataset: - raise ValueError("--do_train requires a train dataset") - train_dataset = dataset["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - train_dataset = train_dataset.filter( - filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers - ) - train_dataset = train_dataset.map( - function=tokenize_captions, - batched=True, - remove_columns=[col for col in column_names if col != image_column], - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - - if data_args.mediapipe_dataloader: - train_dataset.image_mean = image_processor.image_mean - train_dataset.image_std = image_processor.image_std - train_dataset.text_max_length = data_args.max_seq_length - train_dataset.image_resize = config.vision_config.image_size - train_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. - train_dataset.set_transform(transform_images) - - if training_args.do_eval: - if "validation" not in dataset: - raise ValueError("--do_eval requires a train validation") - eval_dataset = dataset["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - eval_dataset = eval_dataset.filter( - filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers - ) - eval_dataset = eval_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - - if data_args.mediapipe_dataloader: - eval_dataset.image_mean = image_processor.image_mean - eval_dataset.image_std = image_processor.image_std - eval_dataset.text_max_length = data_args.max_seq_length - eval_dataset.image_resize = config.vision_config.image_size - eval_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. 
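`set_transform` registers `transform_images` as an on-the-fly transform: pixel values are computed only for the rows actually fetched by the dataloader, instead of being materialized for the whole dataset up front. A hedged sketch of what that means in practice, reusing the objects defined in this script:

```python
# Hedged sketch: with set_transform, transform_images runs lazily at access time.
train_dataset.set_transform(transform_images)

example = train_dataset[0]            # transform_images runs here, for this row only
print(sorted(example.keys()))         # "pixel_values" appears next to "input_ids", "attention_mask"
print(example["pixel_values"].shape)  # e.g. torch.Size([3, 224, 224]) for a 224-pixel vision config
```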
- eval_dataset.set_transform(transform_images) - - if training_args.do_predict: - if "test" not in dataset: - raise ValueError("--do_predict requires a test dataset") - test_dataset = dataset["test"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(test_dataset), data_args.max_eval_samples) - test_dataset = test_dataset.select(range(max_eval_samples)) - - test_dataset = test_dataset.filter( - filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers - ) - test_dataset = test_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on test dataset", - ) - - # Transform images on the fly as doing it on the whole dataset takes too much time. - test_dataset.set_transform(transform_images) - if data_args.mediapipe_dataloader: - test_dataset.image_mean = image_processor.image_mean - test_dataset.image_std = image_processor.image_std - test_dataset.text_max_length = data_args.max_seq_length - test_dataset.image_resize = config.vision_config.image_size - test_dataset.transform_func = transform_images - else: - # Transform images on the fly as doing it on the whole dataset takes too much time. - test_dataset.set_transform(transform_images) - - # 8. Initialize our trainer - trainer_cls = HabanaDataloaderTrainer if data_args.mediapipe_dataloader else GaudiTrainer - trainer = trainer_cls( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - data_collator=collate_fn, - ) - - # 9. Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - tokenizer.save_pretrained(training_args.output_dir) - image_processor.save_pretrained(training_args.output_dir) - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # 10. Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # 11. Write Training Stats and push to hub. - finetuned_from = model_args.model_name_or_path - # If from a local directory, don't set `finetuned_from` as this is required to be a valid repo. id on the Hub. 
- if os.path.isdir(finetuned_from): - finetuned_from = None - kwargs = {"finetuned_from": finetuned_from, "tasks": "contrastive-image-text-modeling"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py deleted file mode 100644 index 8896e0a14d..0000000000 --- a/examples/gaudi_spawn.py +++ /dev/null @@ -1,110 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A simple launcher script for distributed training on HPUs. - -Single node: -:: - >>> python gaudi_spawn.py --world_size=NUM_CARDS_YOU_HAVE --use_mpi - YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other - arguments of your training script) - -Multi node: -:: - >>> python gaudi_spawn.py --hostfile=PATH_TO_HOSTFILE --use_deepspeed - YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other - arguments of your training script) -""" - -import sys -from argparse import REMAINDER, ArgumentParser - -from optimum.habana.distributed import DistributedRunner -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -def parse_args(): - """ - Helper function parsing the command line options. - @retval ArgumentParser - """ - parser = ArgumentParser( - description=( - "Habana Gaudi distributed training launch helper utility that will spawn up multiple distributed" - " processes." - ) - ) - - # Optional arguments for the launch helper - parser.add_argument("--world_size", type=int, default=1, help="Number of HPUs to use (1 or 8)") - parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.") - parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed training") - parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed training") - parser.add_argument("--master_port", type=int, default=29500, help="Master port used by DeepSpeed and MPI") - - # positional - parser.add_argument( - "training_script", - type=str, - help=( - "The full path to the single HPU training " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "training script." 
- ), - ) - - # rest from the training program - parser.add_argument("training_script_args", nargs=REMAINDER) - - return parser.parse_args() - - -def main(): - args = parse_args() - - if args.use_deepspeed: - from transformers.integrations.deepspeed import is_deepspeed_available - - if not is_deepspeed_available(): - raise ImportError( - "--use_deepspeed requires deepspeed: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0`." - ) - - # Patch sys.argv - sys.argv = [args.training_script] + args.training_script_args - # Handle the case where arguments contain whitespaces - argv = ['"{}"'.format(arg) if " " in arg and arg[0] != '"' and arg[-1] != '"' else arg for arg in sys.argv] - command_list = [" ".join(argv)] - - distributed_runner = DistributedRunner( - command_list=command_list, - world_size=args.world_size, - hostfile=args.hostfile, - use_mpi=args.use_mpi, - use_deepspeed=args.use_deepspeed, - master_port=args.master_port, - ) - - ret_code = distributed_runner.run() - sys.exit(ret_code) - - -if __name__ == "__main__": - main() diff --git a/examples/image-classification/README.md b/examples/image-classification/README.md deleted file mode 100644 index 642cf427df..0000000000 --- a/examples/image-classification/README.md +++ /dev/null @@ -1,297 +0,0 @@ - - -# Image Classification Examples - -This directory contains a script that showcases how to fine-tune any model supported by the [`AutoModelForImageClassification` API](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForImageClassification) (such as [ViT](https://huggingface.co/docs/transformers/main/en/model_doc/vit) or [Swin Transformer](https://huggingface.co/docs/transformers/main/en/model_doc/swin)) on HPUs. They can be used to fine-tune models on both [datasets from the hub](#using-datasets-from-hub) as well as on [your own custom data](#using-your-own-data). - - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-HPU training - -### Using datasets from Hub - -Here we show how to fine-tune a Vision Transformer (`ViT`) on Cifar10: - -```bash -python run_image_classification.py \ - --model_name_or_path google/vit-base-patch16-224-in21k \ - --dataset_name cifar10 \ - --output_dir /tmp/outputs/ \ - --remove_unused_columns False \ - --image_column_name img \ - --do_train \ - --do_eval \ - --learning_rate 3e-5 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 64 \ - --per_device_eval_batch_size 64 \ - --evaluation_strategy epoch \ - --save_strategy epoch \ - --load_best_model_at_end True \ - --save_total_limit 3 \ - --seed 1337 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/vit \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 1 \ - --bf16 -``` - -For Swin, you need to change/add the following arguments: -- `--model_name_or_path microsoft/swin-base-patch4-window7-224-in22k` -- `--gaudi_config_name Habana/swin` -- `--ignore_mismatched_sizes` - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -### Using your own data - -To use your own dataset, there are 2 ways: -- you can either provide your own folders as `--train_dir` and/or `--validation_dir` arguments -- you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument. - -Below, we explain both in more detail. 
- -#### Provide them as folders - -If you provide your own folders with images, the script expects the following directory structure: - -```bash -root/dog/xxx.png -root/dog/xxy.png -root/dog/[...]/xxz.png - -root/cat/123.png -root/cat/nsdf3.png -root/cat/[...]/asd932_.png -``` - -In other words, you need to organize your images in subfolders, based on their class. You can then run the script like this: - -```bash -python run_image_classification.py \ - --model_name_or_path google/vit-base-patch16-224-in21k \ - --train_dir \ - --output_dir /tmp/outputs/ \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/vit \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 1 \ - --bf16 -``` - -Internally, the script will use the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature which will automatically turn the folders into 🤗 Dataset objects. - -##### 💡 The above will split the train dir into training and evaluation sets - - To control the split amount, use the `--train_val_split` flag. - - To provide your own validation split in its own directory, you can pass the `--validation_dir ` flag. - -#### Upload your data to the hub, as a (possibly private) repo - -It's very easy (and convenient) to upload your image dataset to the hub using the [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) feature available in 🤗 Datasets. Simply do the following: - -```python -from datasets import load_dataset - -# example 1: local folder -dataset = load_dataset("imagefolder", data_dir="path_to_your_folder") - -# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd) -dataset = load_dataset("imagefolder", data_files="path_to_zip_file") - -# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd) -dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip") - -# example 4: providing several splits -dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]}) -``` - -`ImageFolder` will create a `label` column, and the label name is based on the directory name. - -Next, push it to the hub! - -```python -# assuming you have ran the huggingface-cli login command in a terminal -dataset.push_to_hub("name_of_your_dataset") - -# if you want to push to a private repo, simply pass private=True: -dataset.push_to_hub("name_of_your_dataset", private=True) -``` - -and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub (as explained in [Using datasets from the 🤗 hub](#using-datasets-from-hub)). - -More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets). - -### Sharing your model on 🤗 Hub - -0. If you haven't already, [sign up](https://huggingface.co/join) for a 🤗 account. - -1. Make sure you have `git-lfs` installed and git set up. - -```bash -$ apt install git-lfs -$ git config --global user.email "you@example.com" -$ git config --global user.name "Your Name" -``` - -2. Log in with your HuggingFace account credentials using `huggingface-cli`: - -```bash -$ huggingface-cli login -# ...follow the prompts -``` - -3. 
When running the script, pass the following arguments: - -```bash -python run_image_classification.py \ - --push_to_hub \ - --push_to_hub_model_id \ - ... -``` - - -## Multi-HPU training - -Here is how you would fine-tune ViT on Cifar10 using 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_image_classification.py \ - --model_name_or_path google/vit-base-patch16-224-in21k \ - --dataset_name cifar10 \ - --output_dir /tmp/outputs/ \ - --remove_unused_columns False \ - --image_column_name img \ - --do_train \ - --do_eval \ - --learning_rate 2e-4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 64 \ - --per_device_eval_batch_size 64 \ - --evaluation_strategy epoch \ - --save_strategy epoch \ - --load_best_model_at_end True \ - --save_total_limit 3 \ - --seed 1337 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/vit \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 1 \ - --bf16 -``` - -For Swin, you need to change/add the following arguments: -- `--model_name_or_path microsoft/swin-base-patch4-window7-224-in22k` -- `--gaudi_config_name Habana/swin` -- `--ignore_mismatched_sizes` - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -## Using DeepSpeed - -Similarly to multi-HPU training, here is how you would fine-tune ViT on Cifar10 using 8 HPUs with DeepSpeed: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_image_classification.py \ - --model_name_or_path google/vit-base-patch16-224-in21k \ - --dataset_name cifar10 \ - --output_dir /tmp/outputs/ \ - --remove_unused_columns False \ - --image_column_name img \ - --do_train \ - --do_eval \ - --learning_rate 2e-4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 64 \ - --per_device_eval_batch_size 64 \ - --evaluation_strategy epoch \ - --save_strategy epoch \ - --load_best_model_at_end True \ - --save_total_limit 3 \ - --seed 1337 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/vit \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 1 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... 
- -For instance, you can run inference with ViT on Cifar10 on 1 Gaudi card with the following command: -```bash -python run_image_classification.py \ - --model_name_or_path google/vit-base-patch16-224-in21k \ - --dataset_name cifar10 \ - --output_dir /tmp/outputs/ \ - --remove_unused_columns False \ - --image_column_name img \ - --do_eval \ - --per_device_eval_batch_size 64 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/vit \ - --dataloader_num_workers 1 \ - --bf16 diff --git a/examples/image-classification/requirements.txt b/examples/image-classification/requirements.txt deleted file mode 100644 index 0267eda771..0000000000 --- a/examples/image-classification/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -torch >= 1.5.0, <= 2.4.0 -torchvision >= 0.6.0, <= 0.18.1 -datasets >= 2.14.0, <= 2.19.2 -evaluate == 0.4.2 -scikit-learn == 1.5.0 diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py deleted file mode 100644 index 9f25269b4b..0000000000 --- a/examples/image-classification/run_image_classification.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fine-tuning a 🤗 Transformers model for image classification""" - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import evaluate -import numpy as np -import torch -import transformers -from datasets import load_dataset -from PIL import Image -from torchvision.transforms import ( - CenterCrop, - Compose, - Lambda, - Normalize, - RandomHorizontalFlip, - RandomResizedCrop, - Resize, - ToTensor, -) -from transformers import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - AutoConfig, - AutoImageProcessor, - AutoModelForImageClassification, - HfArgumentParser, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -def pil_loader(path: str): - with open(path, "rb") as f: - im = Image.open(f) - return im.convert("RGB") - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify - them on the command line. - """ - - dataset_name: Optional[str] = field( - default=None, - metadata={ - "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)." - }, - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."}) - validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."}) - train_val_split: Optional[float] = field( - default=0.15, metadata={"help": "Percent to split off of train for validation."} - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - image_column_name: str = field( - default="image", - metadata={"help": "The name of the dataset column containing the image data. Defaults to 'image'."}, - ) - label_column_name: str = field( - default="label", - metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'."}, - ) - - def __post_init__(self): - if self.dataset_name is None and (self.train_dir is None and self.validation_dir is None): - raise ValueError( - "You must specify either a dataset name from the hub or a train and/or validation directory." - ) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - default="google/vit-base-patch16-224-in21k", - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - ignore_mismatched_sizes: bool = field( - default=False, - metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, - ) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_image_classification", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Initialize our dataset and prepare it for the 'image-classification' task. - if data_args.dataset_name is not None: - dataset = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_dir is not None: - data_files["train"] = os.path.join(data_args.train_dir, "**") - if data_args.validation_dir is not None: - data_files["validation"] = os.path.join(data_args.validation_dir, "**") - dataset = load_dataset( - "imagefolder", - data_files=data_files, - cache_dir=model_args.cache_dir, - ) - - dataset_column_names = dataset["train"].column_names if "train" in dataset else dataset["validation"].column_names - if data_args.image_column_name not in dataset_column_names: - raise ValueError( - f"--image_column_name {data_args.image_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--image_column_name` to the correct audio column - one of " - f"{', '.join(dataset_column_names)}." - ) - if data_args.label_column_name not in dataset_column_names: - raise ValueError( - f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--label_column_name` to the correct text column - one of " - f"{', '.join(dataset_column_names)}." - ) - - def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - labels = torch.tensor([example[data_args.label_column_name] for example in examples]) - return {"pixel_values": pixel_values, "labels": labels} - - # If we don't have a validation split, split off a percentage of train as validation. 
- data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split - if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: - split = dataset["train"].train_test_split(data_args.train_val_split) - dataset["train"] = split["train"] - dataset["validation"] = split["test"] - - # Prepare label mappings. - # We'll include these in the model's config to get human readable labels in the Inference API. - labels = dataset["train"].features[data_args.label_column_name].names - label2id, id2label = {}, {} - for i, label in enumerate(labels): - label2id[label] = str(i) - id2label[str(i)] = label - - # Load the accuracy metric from the datasets package - metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) - - # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a - # predictions and label_ids field) and has to return a dictionary string to float. - def compute_metrics(p): - """Computes accuracy on a batch of predictions""" - return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids) - - config = AutoConfig.from_pretrained( - model_args.config_name or model_args.model_name_or_path, - num_labels=len(labels), - label2id=label2id, - id2label=id2label, - finetuning_task="image-classification", - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForImageClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) - image_processor = AutoImageProcessor.from_pretrained( - model_args.image_processor_name or model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # Define torchvision transforms to be applied to each image. 
- if "shortest_edge" in image_processor.size: - size = image_processor.size["shortest_edge"] - else: - size = (image_processor.size["height"], image_processor.size["width"]) - normalize = ( - Normalize(mean=image_processor.image_mean, std=image_processor.image_std) - if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std") - else Lambda(lambda x: x) - ) - _train_transforms = Compose( - [ - RandomResizedCrop(size), - RandomHorizontalFlip(), - ToTensor(), - normalize, - ] - ) - _val_transforms = Compose( - [ - Resize(size), - CenterCrop(size), - ToTensor(), - normalize, - ] - ) - - def train_transforms(example_batch): - """Apply _train_transforms across a batch.""" - example_batch["pixel_values"] = [ - _train_transforms(pil_img.convert("RGB")) for pil_img in example_batch[data_args.image_column_name] - ] - return example_batch - - def val_transforms(example_batch): - """Apply _val_transforms across a batch.""" - example_batch["pixel_values"] = [ - _val_transforms(pil_img.convert("RGB")) for pil_img in example_batch[data_args.image_column_name] - ] - return example_batch - - if training_args.do_train: - if "train" not in dataset: - raise ValueError("--do_train requires a train dataset") - if data_args.max_train_samples is not None: - dataset["train"] = ( - dataset["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) - ) - # Set the training transforms - dataset["train"].set_transform(train_transforms) - - if training_args.do_eval: - if "validation" not in dataset: - raise ValueError("--do_eval requires a validation dataset") - if data_args.max_eval_samples is not None: - dataset["validation"] = ( - dataset["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) - ) - # Set the validation transforms - dataset["validation"].set_transform(val_transforms) - - # Initialize our trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=dataset["train"] if training_args.do_train else None, - eval_dataset=dataset["validation"] if training_args.do_eval else None, - compute_metrics=compute_metrics, - tokenizer=image_processor, - data_collator=collate_fn, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate() - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Write model card and (optionally) push to hub - kwargs = { - "finetuned_from": model_args.model_name_or_path, - "tasks": "image-classification", - "dataset": data_args.dataset_name, - "tags": ["image-classification", "vision"], - } - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md deleted file mode 100644 index 7b41f870e8..0000000000 --- a/examples/image-to-text/README.md +++ /dev/null @@ -1,58 +0,0 @@ - - -# Image to Text Examples - -This directory contains a script that showcases how to use the Transformers pipeline 
API to run image to text task on HPUs. - -## Single-HPU inference - -```bash -python3 run_pipeline.py \ - --model_name_or_path Salesforce/blip-image-captioning-large \ - --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ - --use_hpu_graphs \ - --bf16 -``` -Models that have been validated: - - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) - - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) - - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) - -### Running with FP8 - -Llava-1.5-7b and Llava-1.5-13b in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. - -More information on enabling fp8 in SynapseAI is available here: -https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html - -Here is an example to measure the tensor quantization statistics on Llava-1.5-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ ---model_name_or_path llava-hf/llava-1.5-7b-hf \ ---image_path "https://llava-vl.github.io/static/images/view.jpg" \ ---use_hpu_graphs \ ---bf16 -``` - -Here is an example to quantize the model based on previous measurements for Llava-1.5-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant.json python run_pipeline.py \ ---model_name_or_path llava-hf/llava-1.5-7b-hf \ ---image_path "https://llava-vl.github.io/static/images/view.jpg" \ ---use_hpu_graphs \ ---bf16 -``` diff --git a/examples/image-to-text/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json b/examples/image-to-text/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json deleted file mode 100644 index 602a147baa..0000000000 --- a/examples/image-to-text/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2", - "allowlist": {"types": [], "names": []}, - "blocklist": {"types": [], "names": []}, - "dump_stats_path": "./hqt_output/measure", - "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" -} diff --git a/examples/image-to-text/quantization_config/maxabs_measure.json b/examples/image-to-text/quantization_config/maxabs_measure.json deleted file mode 100644 index 3645fe743a..0000000000 --- a/examples/image-to-text/quantization_config/maxabs_measure.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "allowlist": {"types": [], "names": []}, - "blocklist": {"types": [], "names": []}, - "dump_stats_path": "./hqt_output/measure", - "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" -} \ No newline at end of file diff --git a/examples/image-to-text/quantization_config/maxabs_measure_include_outputs.json b/examples/image-to-text/quantization_config/maxabs_measure_include_outputs.json deleted file mode 100644 index 6de845a54d..0000000000 --- a/examples/image-to-text/quantization_config/maxabs_measure_include_outputs.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "measure_exclude": "NONE", - "allowlist": {"types": [], "names": []}, - "blocklist": {"types": [], "names": []}, - "dump_stats_path": "./hqt_output/measure", - "dump_stats_xlsx_path": 
"./hqt_output/measure/fp8stats.xlsx" -} \ No newline at end of file diff --git a/examples/image-to-text/quantization_config/maxabs_quant.json b/examples/image-to-text/quantization_config/maxabs_quant.json deleted file mode 100644 index 02314a728e..0000000000 --- a/examples/image-to-text/quantization_config/maxabs_quant.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "allowlist": {"types": [], "names": []}, - "blocklist": {"types": [], "names": []}, - "dump_stats_path": "./hqt_output/measure", - "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" -} \ No newline at end of file diff --git a/examples/image-to-text/quantization_config/unit_scale_quant.json b/examples/image-to-text/quantization_config/unit_scale_quant.json deleted file mode 100644 index caad4bb2a4..0000000000 --- a/examples/image-to-text/quantization_config/unit_scale_quant.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "unit_scale", - "allowlist": {"types": [], "names": []}, - "blocklist": {"types": [], "names": []}, - "dump_stats_path": "./hqt_output/measure", - "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" -} diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py deleted file mode 100644 index 9329408f26..0000000000 --- a/examples/image-to-text/run_pipeline.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import json -import logging -import os -import time -from pathlib import Path - -import PIL.Image -import requests -import torch -from transformers import AutoConfig, pipeline - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - help="Path to pre-trained model", - ) - parser.add_argument( - "--image_path", - default=None, - type=str, - nargs="*", - help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")', - ) - - parser.add_argument( - "--prompt", - default=None, - type=str, - help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")', - ) - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not. 
Using HPU graphs should give better latencies.", - ) - parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to perform generation in bf16 precision.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Output directory to store results in.", - ) - parser.add_argument( - "--token", - default=None, - type=str, - help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - args = parser.parse_args() - - # set args.quant_config with env variable if it is set - args.quant_config = os.getenv("QUANT_CONFIG", "") - - adapt_transformers_to_gaudi() - - model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type - if args.image_path is None and model_type == "llava": - args.image_path = ["https://llava-vl.github.io/static/images/view.jpg"] - elif args.image_path is None and model_type == "llava_next": - args.image_path = [ - "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - ] - if args.prompt is None and model_type == "llava": - args.prompt = "\nUSER: What's the content of the image?\nASSISTANT:" - elif args.prompt is None and model_type == "llava_next": - args.prompt = "[INST] \nWhat is shown in this image? [/INST]" - if args.model_name_or_path == "llava-hf/llava-v1.6-vicuna-13b-hf": - args.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" - - image_paths = args.image_path - image_paths_len = len(image_paths) - - if args.batch_size > image_paths_len: - # Dynamically extends to support larger batch sizes - num_path_to_add = args.batch_size - image_paths_len - for i in range(num_path_to_add): - image_paths.append(image_paths[i % image_paths_len]) - elif args.batch_size < image_paths_len: - image_paths = image_paths[: args.batch_size] - - images = [] - - for image_path in image_paths: - images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) - - if args.bf16: - model_dtype = torch.bfloat16 - else: - model_dtype = torch.float32 - - if args.quant_config: - import habana_frameworks.torch.core as htcore - - htcore.hpu_set_env() - - generator = pipeline( - "image-to-text", - model=args.model_name_or_path, - torch_dtype=model_dtype, - device="hpu", - ) - generate_kwargs = { - "lazy_mode": True, - "hpu_graphs": args.use_hpu_graphs, - "max_new_tokens": args.max_new_tokens, - "ignore_eos": True, - } - if args.use_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - generator.model = wrap_in_hpu_graph(generator.model) - - if args.quant_config: - import habana_quantization_toolkit - - habana_quantization_toolkit.prep_model(generator.model) - - htcore.hpu_initialize(generator.model) - - # warm up - for i in range(args.warmup): - generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) - - torch.hpu.synchronize() - if args.quant_config: - habana_quantization_toolkit.finish_measurements(generator.model) - - start = time.perf_counter() - for i in range(args.n_iterations): - result = generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) - end = time.perf_counter() - duration = end - start - - total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens - throughput = total_new_tokens_generated / duration - logger.info( - f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second" - ) - - # Store results if necessary - if args.output_dir is not None: - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - results = { - "throughput": throughput, - "output": result, - } - with (output_dir / "results.json").open("w", encoding="utf-8") as f: - json.dump(results, f, ensure_ascii=False, indent=4) - - -if __name__ == "__main__": - main() diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md deleted file mode 100644 index 552079432c..0000000000 --- a/examples/language-modeling/README.md +++ /dev/null @@ -1,849 +0,0 @@ - - -# Language Model Training - -Fine-tuning (or training from scratch) the library models for language modeling on a text dataset. -GPT-2 is trained or fine-tuned using a causal language modeling (CLM) loss while ALBERT, BERT, DistilBERT and RoBERTa are trained or fine-tuned using a masked language modeling (MLM) loss. You can find more information about the differences between those objectives in our [model summary](https://huggingface.co/transformers/model_summary.html). - -The following examples will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own -text files for training and validation. We give examples of both below. 
- -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## GPT2/GPT-J/GPT-NeoX and causal language modeling - -The following examples fine-tune GPT-2, GPT-J-6B and GPT-NeoX-20B on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is the one of causal language modeling. - - -### Single-card Training (GPT2) - -```bash -python run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 -``` - -This takes about 13 minutes to train on a single HPU. It reaches -a perplexity of about 20.9963 once fine-tuned on the dataset. - -To run on your own training and validation files, use the following command: - -```bash -python run_clm.py \ - --model_name_or_path gpt2 \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 -``` - - -### Multi-card Training (GPT2) - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 -``` - -This takes about 4 minutes to train on 8 HPUs. It reaches -a perplexity of 21.7968 once fine-tuned on the dataset. - - -### Multi-card Training with Deepspeed (GPT-J) - -The following command triggers the fine-tuning of [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) on WikiText-2 with DeepSpeed ZeRO-2. -Fine tuning on 8 HPU cards takes around 6 minutes with a batch size of 32 (4 per device). -It reaches a perplexity of 14.011. - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path EleutherAI/gpt-j-6b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm-xl-1 \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_for_deepspeed_config -``` - -This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json - - -## Multi-Node Training with Deepspeed (GPT-NeoX) - -The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2. -Fine-tuning on 16 HPU cards (2 Gaudi2 nodes) takes around 9 minutes with a batch size of 32 (2 per device). -It reaches a perplexity of 10.469. 
- -> Please refer to [this page](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training) for performing multi-node training properly. - -```bash -python ../gaudi_spawn.py \ - --hostfile path_to_my_hostfile --use_deepspeed run_clm.py \ - --model_name_or_path EleutherAI/gpt-neox-20b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 2\ - --per_device_eval_batch_size 2 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm-xl-bs2 \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_for_deepspeed_config -``` - -This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json - - -## RoBERTa/BERT/DistilBERT and masked language modeling - -The following examples fine-tune RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their pre-training: masked language modeling. -Following the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, -converge slightly slower (over-fitting takes more epochs). - - -### Single-card Training - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -To run on your own training and validation files, use the following command: - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script -concatenates all texts and then splits them into blocks of the same length). - -**Note:** On HPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make sure all your batches have the same length. 
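The note above is largely about tensor shapes: Gaudi performs best when batches keep a static shape, so padding every line to the same `max_seq_length` lets the compiled graph be reused instead of being rebuilt for each new sequence length. A minimal sketch of the difference (illustrative only, not taken from `run_mlm.py`; the checkpoint, sentences and `max_length` are arbitrary):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
lines = ["A short line.", "A noticeably longer line that produces quite a few more tokens."]

# Dynamic padding: each batch is padded only to its own longest sample, so the
# sequence dimension changes from batch to batch (shape depends on the data).
dynamic = tokenizer(lines, padding="longest", return_tensors="pt")

# Fixed padding (what --pad_to_max_length gives you): every batch comes out with
# the same (batch_size, max_length) shape, keeping shapes static on HPU.
static = tokenizer(lines, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

print(dynamic["input_ids"].shape)  # sequence length depends on the longest line in the batch
print(static["input_ids"].shape)   # always (2, 128)
```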
- - -### Multi-card Training - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_mlm.py \ - --model_name_or_path roberta-base \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -### Training in torch.compile mode -RoBERTa-Large model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, -a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. -b) Run the above commands with `--model_name_or_path roberta-large`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags. - - -## Pretraining - -You can easily train a model from scratch by replacing `--model_name_or_path my_model_name` by `--config_name my_model_name --tokenizer_name my_model_name`. - -For example with GPT2: -```bash -python run_clm.py \ - --config_name gpt2 \ - --tokenizer_name gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -## Using DeepSpeed - -Multi-card examples can be simply adapted to be run with DeepSpeed. Here is the CLM example with GPT2-XL: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path gpt2-xl \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --learning_rate 4e-4 \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. 
-Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - -Here is another example with Bloom-7B1: - -```bash -DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 PT_HPU_MAX_COMPOUND_OP_SYNC=1 PT_HPU_MAX_COMPOUND_OP_SIZE=1 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path bigscience/bloom-7b1 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --do_train \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/roberta-base \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --save_strategy "no" \ - --learning_rate 1e-04 \ - --deepspeed path_to_my_deepspeed_config -``` -[This](https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_3_gaudi1.json) is a DeepSpeed configuration you can use to train this model on Gaudi1. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with GPT2 on the Wikitext dataset on 1 Gaudi card with the following command: -```bash -python run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_eval_batch_size 4 \ - --do_eval \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --bf16 -``` - - -## PEFT - -### LORA/ADALORA/IA3 - -To run LoRA finetuning, you can use `run_lora_clm.py`. -Here are single-/multi-device command examples for Llama1-7B, Falcon-40B, Llama2-70B, Llama3-8B and Llama3-70B. 
-You can also use multicard version for Falcon-180B: - -- Single-card finetuning of Llama1-7B: -```bash -python3 run_lora_clm.py \ - --model_name_or_path huggyllama/llama-7b \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 3 \ - --per_device_train_batch_size 16 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 1e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 1 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --low_cpu_mem_usage True \ - --validation_split_percentage 4 \ - --adam_epsilon 1e-08 -``` -- Single-card finetuning of Falcon-40B: -```bash -LOWER_LIST=ops_bf16.txt python3 run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-40b \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --low_cpu_mem_usage True \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 -``` - -- Multi-card finetuning of Llama1-7B: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path huggyllama/llama-7b \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 1 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 4 \ - --low_cpu_mem_usage True -``` - -- Multi-card finetuning of Llama2-7B with FP8: -```bash -LOWER_LIST=ops_bf16.txt python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 3 \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 20 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 18 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - 
--dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 10 \ - --low_cpu_mem_usage True \ - --pipelining_fwd_bwd \ - --fp8 True -``` - -- Multi-card finetuning of codegen-16B-mono: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path Salesforce/codegen-16B-mono \ - --dataset_name b-mc2/sql-create-context \ - --sql_prompt \ - --bf16 True \ - --output_dir ./finetuned-models/codegen-finetune-on-sql-create-context-hpu8-lora8-bs4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 1e-4 \ - --logging_steps 1 \ - --dataset_concatenation \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_inference \ - --lora_target_modules "qkv_proj" \ - --lora_rank 8 \ - --do_eval \ - --validation_split_percentage 10 \ - --use_cache False -``` - -- Multi-card finetuning of Falcon-40B: -```bash -LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-40b \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 4e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --do_eval \ - --low_cpu_mem_usage True \ - --validation_split_percentage 6 -``` - -- Multi-card finetuning of Llama2-70B with DeepSpeed ZeRO-3 optimization and LoRA: - - > The following command requires Habana DeepSpeed 1.13.0 or later. - -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \ -python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-70b-hf \ - --deepspeed llama2_ds_zero3_config.json \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./lora_out \ - --num_train_epochs 2 \ - --max_seq_len 2048 \ - --per_device_train_batch_size 10 \ - --per_device_eval_batch_size 10 \ - --gradient_checkpointing \ - --evaluation_strategy epoch \ - --eval_delay 2 \ - --save_strategy no \ - --learning_rate 0.0018 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --dataset_concatenation \ - --attn_softmax_bf16 True \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank 4 \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ - --validation_split_percentage 4 \ - --use_flash_attention True \ - --flash_attention_causal_mask True -``` - -- Multi-card finetuning of Llama2-70B with DeepSpeed ZeRO-3 optimization, LoRA and FP8 precision: - - > The following command requires Habana DeepSpeed 1.13.0 or later. 
- -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=10 \ -python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-70b-hf \ - --deepspeed llama2_ds_zero3_config.json \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./lora_out \ - --num_train_epochs 2 \ - --max_seq_len 2048 \ - --per_device_train_batch_size 10 \ - --per_device_eval_batch_size 1 \ - --gradient_checkpointing \ - --evaluation_strategy epoch \ - --eval_delay 2 \ - --save_strategy no \ - --learning_rate 0.0018 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --dataset_concatenation \ - --attn_softmax_bf16 True \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank 4 \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ - --validation_split_percentage 4 \ - --use_flash_attention True \ - --flash_attention_causal_mask True \ - --fp8 True -``` - -- Multi-card finetuning of Llama2-70B with FSDP and LoRA: - -```bash -LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \ -python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-70b-hf \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./lora_out \ - --max_seq_len 2048 \ - --gradient_checkpointing \ - --per_device_train_batch_size 5 \ - --save_strategy no \ - --learning_rate 0.0004 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --dataset_concatenation \ - --do_train \ - --use_habana \ - --throughput_warmup_steps 3 \ - --lora_rank 4 \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ - --attn_softmax_bf16 True \ - --validation_split_percentage 4 \ - --use_lazy_mode False \ - --fsdp_config fsdp_config.json \ - --fsdp auto_wrap \ - --num_train_epochs 2 \ - --evaluation_strategy epoch \ - --per_device_eval_batch_size 1 \ - --eval_delay 2 \ - --do_eval \ - --pipelining_fwd_bwd False \ - --use_fused_rope False \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --gradient_accumulation_steps 2 \ - --use_flash_attention True \ - --flash_attention_causal_mask True -``` - -- Multi-card finetuning of Falcon-180B: - - Falcon-180B example command saves only the LoRA parameters at end - - For inference we need to merge the pretrained model and LoRA weights -```bash -DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-180B \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --evaluation_strategy "no" \ - --save_strategy "no" \ - --learning_rate 4e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 \ - --deepspeed ds_falcon_180b_z3.json -``` -Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` 
or `--peft_type ia3`.
-
-### Prompt/Prefix/P-tuning
-
-To run prompt tuning, you can use `run_prompt_tuning_clm.py`.
-Here are single-card and multi-card command examples for Llama2-7B:
-- Single-card finetuning of meta-llama/Llama-2-7b-hf with the "ought/raft" dataset and the "twitter_complaints" config:
-```bash
-python3 run_prompt_tuning_clm.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir prompt_tuning_out \
-    --bf16 True \
-    --report_to=none \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --low_cpu_mem_usage True \
-    --logging_steps 1 \
-    --do_train \
-    --num_train_epochs 50 \
-    --do_eval \
-    --use_habana \
-    --use_lazy_mode
-```
-
-- Multi-card finetuning of meta-llama/Llama-2-7b-hf with the "ought/raft" dataset and the "twitter_complaints" config:
-```bash
-python3 ../gaudi_spawn.py \
-    --world_size 8 --use_mpi run_prompt_tuning_clm.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir prompt_tuning_out \
-    --bf16 True \
-    --report_to=none \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --low_cpu_mem_usage True \
-    --logging_steps 1 \
-    --do_train \
-    --num_train_epochs 50 \
-    --do_eval \
-    --use_habana \
-    --use_lazy_mode
-```
-The default `peft_type` is `prompt_tuning`; you can enable prefix tuning or p-tuning with `--peft_type prefix_tuning` or `--peft_type p_tuning`.
-
-Use the prompt-tuned model for text generation:
-```bash
-python3 ../text-generation/run_generation.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --max_new_tokens 128 \
-    --bf16 \
-    --use_kv_cache \
-    --batch_size 1 \
-    --use_hpu_graphs \
-    --ignore_eos \
-    --peft_model prompt_tuning_out \
-    --prompt "@SEPTA_SOCIAL Ok. Thanks. Label :"
-```
-
-## Streaming
-
-To use the streaming dataset mode, which can be very useful for large datasets, add `--streaming` and specify `--max_steps` in the command line. This is supported by `run_mlm.py` and `run_clm.py`.
-
-For example:
-```bash
-python run_clm.py \
-    --model_name_or_path gpt2 \
-    --dataset_name wikitext \
-    --dataset_config_name wikitext-2-raw-v1 \
-    --per_device_train_batch_size 4 \
-    --per_device_eval_batch_size 4 \
-    --do_train \
-    --output_dir /tmp/test-clm \
-    --gaudi_config_name Habana/gpt2 \
-    --use_habana \
-    --use_lazy_mode \
-    --use_hpu_graphs_for_inference \
-    --throughput_warmup_steps 3 \
-    --streaming \
-    --max_steps 1000 \
-    --do_eval
-```
-
-## Creating a model on the fly
-
-When training a model from scratch, configuration values may be overridden with `--config_overrides`:
-
-```bash
-python run_clm.py \
-    --model_type gpt2 \
-    --tokenizer_name gpt2 \
-    --config_overrides="n_embd=1024,n_head=16,n_layer=48,n_positions=1024" \
-    --dataset_name wikitext \
-    --dataset_config_name wikitext-2-raw-v1 \
-    --per_device_train_batch_size 2 \
-    --per_device_eval_batch_size 2 \
-    --do_train \
-    --do_eval \
-    --gradient_checkpointing \
-    --use_cache False \
-    --output_dir /tmp/test-clm \
-    --use_habana \
-    --use_lazy_mode \
-    --use_hpu_graphs_for_inference \
-    --gaudi_config_name Habana/gpt2 \
-    --throughput_warmup_steps 3
-```
-
-## Low CPU Memory Usage
-
-To use the low CPU memory mode, which can be very useful for LLMs, add `--low_cpu_mem_usage` to the command line.
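
As a rough illustration of where the flag goes (this is not one of the official example commands; it simply reuses the model, dataset, and output path from the `run_clm.py` streaming example above):

```bash
# Based on the run_clm.py example above, with low CPU memory mode enabled.
# All values other than --low_cpu_mem_usage are reused from the earlier command.
python run_clm.py \
    --model_name_or_path gpt2 \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 4 \
    --do_train \
    --output_dir /tmp/test-clm \
    --gaudi_config_name Habana/gpt2 \
    --use_habana \
    --use_lazy_mode \
    --low_cpu_mem_usage True
```

With the flag set, the model is first created as an empty shell and its parameters are only materialized when the pretrained weights are loaded, which reduces host RAM usage and loading time for large models.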
diff --git a/examples/language-modeling/ds_falcon_180b_z3.json b/examples/language-modeling/ds_falcon_180b_z3.json deleted file mode 100644 index c0c045b088..0000000000 --- a/examples/language-modeling/ds_falcon_180b_z3.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "bf16": { - "enabled": "auto" - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e9, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_16bit_weights_on_model_save": true - }, - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 1, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/examples/language-modeling/fsdp_config.json b/examples/language-modeling/fsdp_config.json deleted file mode 100644 index 4aae21af2f..0000000000 --- a/examples/language-modeling/fsdp_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", - "fsdp_backward_prefetch": "BACKWARD_PRE", - "fsdp_forward_prefetch": false, - "fsdp_offload_params": false, - "fsdp_sharding_strategy": 1, - "fsdp_state_dict_type": "FULL_STATE_DICT", - "fsdp_sync_module_states": true, - "fsdp_use_orig_params": true, - "transformer_layer_cls_to_wrap": "GaudiLlamaDecoderLayer", - "fsdp_activation_checkpointing": false -} diff --git a/examples/language-modeling/llama2_ds_zero3_config.json b/examples/language-modeling/llama2_ds_zero3_config.json deleted file mode 100755 index 2d64cbd0b3..0000000000 --- a/examples/language-modeling/llama2_ds_zero3_config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 3, - "overlap_comm": false, - "contiguous_gradients": false, - "stage3_gather_16bit_weights_on_model_save": true - } -} diff --git a/examples/language-modeling/ops_bf16.txt b/examples/language-modeling/ops_bf16.txt deleted file mode 100644 index 0fb7df6cdb..0000000000 --- a/examples/language-modeling/ops_bf16.txt +++ /dev/null @@ -1,32 +0,0 @@ -addmm -addbmm -batch_norm -baddbmm -bmm -conv1d -conv2d -conv3d -conv_transpose1d -conv_transpose2d -conv_transpose3d -dot -dropout -feature_dropout -group_norm -instance_norm -layer_norm -leaky_relu -linear -matmul -mean -mm -mul -mv -softmax -log_softmax -sin -cos -add -div -gather -embedding diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt deleted file mode 100644 index 73d9328bb5..0000000000 --- a/examples/language-modeling/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch >= 1.3, <= 2.4.0 -datasets >= 2.14.0, <= 2.19.2 -sentencepiece != 0.1.92 -protobuf == 3.20.3 -evaluate == 0.4.1 -scikit-learn == 1.4.1.post1 -peft == 0.10.0 diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py deleted file mode 100644 index fb00e93fb2..0000000000 --- a/examples/language-modeling/run_clm.py +++ /dev/null @@ -1,692 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Training the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. -Here is the full list of checkpoints on the hub that can be trained by this script: -https://huggingface.co/models?filter=text-generation -""" -# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from itertools import chain -from typing import Optional - -import datasets -import evaluate -import torch -import transformers -from datasets import load_dataset -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - default_data_collator, -) -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization. Don't set it if you want to train a model from" - " scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. 
Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), - "choices": ["auto", "bfloat16", "float16", "float32"], - }, - ) - use_cache: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not the model should return the last key/values attentions (not used by all models)." - "Only relevant if `config.is_decoder=True`." - ) - }, - ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "Setting it to True will benefit LLM loading time and RAM consumption." - ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." 
- ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - streaming: bool = field(default=False, metadata={"help": "Enable streaming mode."}) - block_size: Optional[int] = field( - default=None, - metadata={ - "help": ( - "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." - ) - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - save_last_ckpt: bool = field( - default=True, metadata={"help": "Whether to save checkpoint at the end of the training."} - ) - - def __post_init__(self): - if self.streaming: - require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") - - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clm", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "token": model_args.token, - "trust_remote_code": model_args.trust_remote_code, - "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "token": model_args.token, - "trust_remote_code": model_args.trust_remote_code, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script. " - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if model_args.model_name_or_path: - torch_dtype = ( - model_args.torch_dtype - if model_args.torch_dtype in ["auto", None] - else getattr(torch, model_args.torch_dtype) - ) - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, - ) - else: - model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) - n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. - embedding_size = model.get_input_embeddings().weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - - # Preprocessing the datasets. - # First we tokenize all the texts. 
- if training_args.do_train: - column_names = list(raw_datasets["train"].features) - else: - column_names = list(raw_datasets["validation"].features) - text_column_name = "text" if "text" in column_names else column_names[0] - - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" - " before being passed to the model." - ) - return output - - with training_args.main_process_first(desc="dataset map tokenization"): - if not data_args.streaming: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - remove_columns=column_names, - ) - - if hasattr(config, "max_position_embeddings"): - max_pos_embeddings = config.max_position_embeddings - else: - # Define a default value if the attribute is missing in the config. - max_pos_embeddings = 1024 - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > max_pos_embeddings: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx." - ) - if max_pos_embeddings > 0: - block_size = min(1024, max_pos_embeddings) - else: - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. - # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/process#map - - with training_args.main_process_first(desc="grouping texts together"): - if not data_args.streaming: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - ) - - if training_args.do_train: - - def tensor_mapper(x): - return {i: torch.tensor(x[i], dtype=torch.int32) for i in x} - - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if training_args.resume_from_checkpoint is not None and training_args.resume_from_checkpoint != "": - train_dataset = train_dataset.map(tensor_mapper) - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - def preprocess_logits_for_metrics(logits, labels): - if isinstance(logits, tuple): - # Depending on the model and config, logits may contain extra tensors, - # like past_key_values, but logits always come first - logits = logits[0] - return logits.argmax(dim=-1) - - metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) - - def compute_metrics(eval_preds): - preds, labels = eval_preds - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics but we need to shift the labels - labels = labels[:, 1:].reshape(-1) - preds = preds[:, :-1].reshape(-1) - return metric.compute(predictions=preds, references=labels) - - # Initialize our Trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. 
- data_collator=default_data_collator, - compute_metrics=compute_metrics if training_args.do_eval else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - if data_args.save_last_ckpt: - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - if data_args.streaming: - metrics["train_samples"] = training_args.max_steps * training_args.per_device_train_batch_size - else: - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - if not data_args.streaming: - max_eval_samples = ( - data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - ) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py deleted file mode 100644 index 9998e20cb9..0000000000 --- a/examples/language-modeling/run_lora_clm.py +++ /dev/null @@ -1,816 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import List, Optional - -import datasets -import evaluate -import torch -import transformers -from datasets import load_dataset -from peft import AdaLoraConfig, IA3Config, LoraConfig, TaskType, get_peft_model, tuners -from peft.utils.other import fsdp_auto_wrap_policy -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - DataCollatorForLanguageModeling, - HfArgumentParser, -) -from transformers.trainer_utils import is_main_process - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import is_gaudi3, set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -IGNORE_INDEX = -100 - -os.environ["WANDB_DISABLED"] = "true" - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.10.0") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - config_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained config name or path if not the same as model_name"}, - ) - tokenizer_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - token: Optional[str] = field( - default=None, - metadata={"help": "auth token for private models"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": "should enable when using custom model architecture that is not yet part of the Hugging Face transformers package like MPT)." - }, - ) - use_cache: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not the model should return the last key/values attentions (not used by all models)." - "Only relevant if `config.is_decoder=True`." - ) - }, - ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." - "When set to True, it will benefit LLM loading time and RAM consumption." - ) - }, - ) - attn_softmax_bf16: bool = field( - default=False, - metadata={ - "help": ( - "Whether to run attention softmax layer in bf16 precision for fine-tuning. The current support is limited to Llama only." - ) - }, - ) - use_flash_attention: bool = field( - default=False, - metadata={ - "help": ( - "Whether to use Habana flash attention for fine-tuning. The current support is limited to Llama only." 
- ) - }, - ) - flash_attention_recompute: bool = field( - default=False, - metadata={ - "help": ( - "Whether to enable recompute in Habana flash attention for fine-tuning." - " It is applicable only when use_flash_attention is True." - ) - }, - ) - flash_attention_causal_mask: bool = field( - default=False, - metadata={ - "help": ( - "Whether to enable causal mask in Habana flash attention for fine-tuning." - " It is applicable only when use_flash_attention is True." - ) - }, - ) - flash_attention_fp8: bool = field( - default=False, - metadata={ - "help": ( - "Whether to enable flash attention in FP8." - ) - }, - ) - use_fused_rope: bool = field( - default=True, - metadata={ - "help": ("Whether to use Habana fused-rope for fine-tuning. The current support is limited to Llama only.") - }, - ) - load_meta_device: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to load the model to the device instead of the host, so it can reduce the host RAM usage." - "https://huggingface.co/blog/accelerate-large-models" - ) - }, - ) - - -@dataclass -class DataArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - dataset_config_name: Optional[str] = field( - default=None, - metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}, - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_seq_length: Optional[int] = field( - default=512, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." - }, - ) - validation_split_percentage: Optional[int] = field( - default=0, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - overwrite_cache: bool = field( - default=False, - metadata={"help": "Overwrite the cached preprocessed datasets or not."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - keep_in_memory: bool = field( - default=False, - metadata={"help": "Whether to keep in memory the loaded dataset. Defaults to False."}, - ) - dataset_seed: int = field( - default=42, - metadata={ - "help": "Seed to use in dataset processing, different seeds might yield different datasets. This seed and the seed in training arguments are not related" - }, - ) - dataset_cache_directory: Optional[str] = field( - default=None, - metadata={ - "help": "Path to directory where the processed dataset will be saved. If path exists, try to load processed dataset from this path." 
- }, - ) - dataset_concatenation: Optional[bool] = field( - default=False, - metadata={"help": "Whether to concatenate the sentence for more efficient training."}, - ) - sql_prompt: bool = field( - default=False, - metadata={"help": "Whether to have a SQL style prompt"}, - ) - save_last_ckpt: bool = field( - default=True, metadata={"help": "Whether to save checkpoint at the end of the training."} - ) - - -@dataclass -class FinetuneArguments: - """ - Arguments of finetune we are going to apply on the model. - """ - - lora_rank: int = field( - default=8, - metadata={"help": "Rank parameter in the LoRA method."}, - ) - lora_alpha: int = field( - default=16, - metadata={"help": "Alpha parameter in the LoRA method."}, - ) - lora_dropout: float = field( - default=0.05, - metadata={"help": "Dropout parameter in the LoRA method."}, - ) - lora_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the LoRA/AdaLoRA method."}, - ) - train_on_inputs: bool = field( - default=True, - metadata={"help": "if False, masks out inputs in loss"}, - ) - adalora_init_r: int = field( - default=12, - metadata={"help": "Initial AdaLoRA rank"}, - ) - adalora_target_r: int = field( - default=4, - metadata={"help": "Target AdaLoRA rank"}, - ) - adalora_tinit: int = field( - default=50, - metadata={"help": "Number of warmup steps for AdaLoRA wherein no pruning is performed"}, - ) - adalora_tfinal: int = field( - default=100, - metadata={ - "help": "Fix the resulting budget distribution and fine-tune the model for tfinal steps when using AdaLoRA" - }, - ) - adalora_delta_t: int = field( - default=10, - metadata={"help": "Interval of steps for AdaLoRA to update rank"}, - ) - adalora_orth_reg_weight: float = field( - default=0.5, - metadata={"help": "Orthogonal regularization weight for AdaLoRA"}, - ) - peft_type: str = field( - default="lora", - metadata={ - "help": ("The PEFT type to use."), - "choices": ["lora", "ia3", "adalora"], - }, - ) - ia3_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the IA3 method."}, - ) - feedforward_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target feedforward modules for the IA3 method."}, - ) - - -PROMPT_DICT = { - "prompt_with_input": ( - "Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" - ), - "prompt_without_input": ( - "Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response:" - ), -} - -SQL_PROMPT = ( - "You are a text-to-SQL model. Your job is to answer questions about a database. " - "You are given a question and a context regarding one or more tables in the database.\n\n" - "You must output the SQL query that answers the question. 
The SQL query must be between [SQL] and [/SQL] tags.\n\n" - "### Question: \n{question}\n\n### Context: \n{context}\n\n### Response:" -) - - -def create_prompts(examples): - prompts = {} - prompts["source"] = [] - prompts["target"] = [] - for example in examples: - prompt_template = ( - PROMPT_DICT["prompt_with_input"] if example["input"] != "" else PROMPT_DICT["prompt_without_input"] - ) - source = prompt_template.format_map(example) - prompts["source"].append(source) - prompts["target"].append(example["output"]) - return prompts - - -def create_sql_prompts(examples): - prompts = {} - prompts["source"] = [] - prompts["target"] = [] - for example in examples: - source = SQL_PROMPT.format_map(example) - prompts["source"].append(source) - prompts["target"].append(example["answer"]) - return prompts - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataArguments, GaudiTrainingArguments, FinetuneArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, finetune_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) - else: - ( - model_args, - data_args, - training_args, - finetune_args, - ) = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Log on each process the small summary - b16 = training_args.fp16 or training_args.bf16 - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {b16}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "trust_remote_code": True if model_args.trust_remote_code else None, - "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, - "token": model_args.token, - "flash_attention_fp8": model_args.flash_attention_fp8, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - raise ValueError("Please provide value for model_name_or_path or config_name.") - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "token": model_args.token, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - if "validation" not in raw_datasets.keys() and training_args.do_eval: - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - - # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
- if "validation" not in raw_datasets.keys() and training_args.do_eval: - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - **dataset_args, - ) - - if data_args.dataset_name == "tatsu-lab/alpaca" or data_args.sql_prompt: - # Preprocessing the datasets. - for key in raw_datasets: - prompts = ( - create_prompts(raw_datasets[key]) - if not data_args.sql_prompt - else create_sql_prompts(raw_datasets[key]) - ) - columns_to_be_removed = list(raw_datasets[key].features.keys()) - raw_datasets[key] = raw_datasets[key].add_column("prompt_sources", prompts["source"]) - raw_datasets[key] = raw_datasets[key].add_column("prompt_targets", prompts["target"]) - raw_datasets[key] = raw_datasets[key].remove_columns(columns_to_be_removed) - elif ( - data_args.dataset_name == "timdettmers/openassistant-guanaco" - ): # from https://github.com/artidoro/qlora/blob/main/qlora.py#L621 - raw_datasets = raw_datasets.map( - lambda x: { - "input": "", - "output": x["text"], - } - ) - # Remove unused columns. - raw_datasets = raw_datasets.remove_columns( - [col for col in raw_datasets.column_names["train"] if col not in ["input", "output"]] - ) - else: - raise ValueError("Unsupported dataset") - # Load model - if model_args.model_name_or_path: - model_dtype = torch.bfloat16 if training_args.bf16 else None - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - trust_remote_code=True if model_args.trust_remote_code else None, - torch_dtype=model_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, - device_map=training_args.device.type if model_args.load_meta_device else None, - token=model_args.token, - ) - else: - raise ValueError("Must provide model_name_or_path to load a pretrained CausalLM model.") - - if model.config.model_type == "llama": - # unwind broken decapoda-research config - model.generation_config.pad_token_id = 0 - model.generation_config.bos_token_id = 1 - model.generation_config.eos_token_id = 2 - if model_args.attn_softmax_bf16: - model.generation_config.attn_softmax_bf16 = True - if model_args.use_flash_attention: - model.generation_config.use_flash_attention = True - model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute - model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask - - if model_args.flash_attention_fp8: - assert is_gaudi3(), "Flash attention in FP8 is supported only on Gaudi3" - if not model_args.use_fused_rope: - model.generation_config.use_fused_rope = False - - if hasattr(model.generation_config, "pad_token_id") and model.generation_config.pad_token_id is not None: - tokenizer.pad_token_id = model.generation_config.pad_token_id - if hasattr(model.generation_config, "eos_token_id") and model.generation_config.eos_token_id is not None: - tokenizer.eos_token_id = model.generation_config.eos_token_id - if hasattr(model.generation_config, "bos_token_id") and model.generation_config.bos_token_id is not None: - tokenizer.bos_token_id = model.generation_config.bos_token_id - - if 
tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - - def tokenize(prompt, add_eos_token=True): - results = tokenizer( - prompt, - truncation=True, - max_length=data_args.max_seq_length, - padding=False, - return_tensors=None, - ) - for i in range(len(results["input_ids"])): - if ( - results["input_ids"][i][-1] != tokenizer.eos_token_id - and len(results["input_ids"][i]) < data_args.max_seq_length - and add_eos_token - ): - results["input_ids"][i].append(tokenizer.eos_token_id) - results["attention_mask"][i].append(1) - - results["labels"] = copy.deepcopy(results["input_ids"]) - results["input_id_len"] = [len(result) for result in results["input_ids"]] - return results - - def preprocess_function(examples): - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") - - st = [s + t for s, t in zip(examples[keys[0]], examples[keys[1]])] - - examples_tokenized = tokenize(st) - input_ids = examples_tokenized["input_ids"] - labels = examples_tokenized["labels"] - if not finetune_args.train_on_inputs: - sources_tokenized = tokenize(examples[keys[0]], add_eos_token=False) - for label, source_len in zip(labels, sources_tokenized["input_id_len"]): - label[:source_len] = [IGNORE_INDEX] * source_len - return { - "input_ids": input_ids, - "labels": labels, - "attention_mask": examples_tokenized["attention_mask"], - } - - with training_args.main_process_first(desc="dataset map pre-processing"): - tokenized_datasets = raw_datasets.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - ) - - if data_args.dataset_concatenation: - - def concatenate_data(dataset, max_seq_length): - concatenated_dataset = {} - for column in dataset.features: - concatenated_data = [item for sample in dataset[column] for item in sample] - reshaped_data = [ - concatenated_data[i * max_seq_length : (i + 1) * max_seq_length] - for i in range(len(concatenated_data) // max_seq_length) - ] - concatenated_dataset[column] = reshaped_data - return datasets.Dataset.from_dict(concatenated_dataset) - - if data_args.dataset_name == "tatsu-lab/alpaca" or data_args.sql_prompt: - tokenized_datasets_ = tokenized_datasets["train"].remove_columns(["prompt_sources", "prompt_targets"]) - if training_args.do_eval: - tokenized_datasets_eval_ = tokenized_datasets["validation"].remove_columns( - ["prompt_sources", "prompt_targets"] - ) - elif data_args.dataset_name == "timdettmers/openassistant-guanaco": - tokenized_datasets_ = tokenized_datasets["train"].remove_columns(["input", "output"]) - if training_args.do_eval: - tokenized_datasets_eval_ = tokenized_datasets["test"].remove_columns(["input", "output"]) - else: - raise ValueError("Unsupported dataset") - tokenized_datasets["train"] = concatenate_data(tokenized_datasets_, data_args.max_seq_length) - if training_args.do_eval: - tokenized_datasets["validation"] = concatenate_data(tokenized_datasets_eval_, data_args.max_seq_length) - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = tokenized_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = tokenized_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = 
eval_dataset.select(range(data_args.max_eval_samples)) - - def preprocess_logits_for_metrics(logits, labels): - if isinstance(logits, tuple): - # Depending on the model and config, logits may contain extra tensors, - # like past_key_values, but logits always come first - logits = logits[0] - return logits.argmax(dim=-1) - - metric = evaluate.load("accuracy") - - def compute_metrics(eval_preds): - preds, labels = eval_preds - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics but we need to shift the labels - labels = labels[:, 1:].reshape(-1) - preds = preds[:, :-1].reshape(-1) - return metric.compute(predictions=preds, references=labels) - - # Data collator - # This one will take care of randomly masking the tokens. - data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False) - logger.info("Using data collator of type {}".format(data_collator.__class__.__name__)) - - if training_args.do_train or training_args.do_eval: - # PEFT settings - if finetune_args.peft_type == "lora": - peft_config = LoraConfig( - r=finetune_args.lora_rank, - lora_alpha=finetune_args.lora_alpha, - lora_dropout=finetune_args.lora_dropout, - target_modules=finetune_args.lora_target_modules, - bias="none", - task_type=TaskType.CAUSAL_LM, - ) - elif finetune_args.peft_type == "adalora": - peft_config = AdaLoraConfig( - init_r=finetune_args.adalora_init_r, - target_r=finetune_args.adalora_target_r, - tinit=finetune_args.adalora_tinit, - tfinal=finetune_args.adalora_tfinal, - deltaT=finetune_args.adalora_delta_t, - lora_alpha=finetune_args.lora_alpha, - lora_dropout=finetune_args.lora_dropout, - target_modules=finetune_args.lora_target_modules, - orth_reg_weight=finetune_args.adalora_orth_reg_weight, - bias="none", - task_type=TaskType.CAUSAL_LM, - ) - from optimum.habana.peft.layer import GaudiAdaloraLayerSVDLinearForward - - tuners.adalora.layer.SVDLinear.forward = GaudiAdaloraLayerSVDLinearForward - elif finetune_args.peft_type == "ia3": - peft_config = IA3Config( - target_modules=finetune_args.ia3_target_modules, - feedforward_modules=finetune_args.feedforward_modules, - task_type=TaskType.CAUSAL_LM, - ) - if training_args.gradient_checkpointing: - model.enable_input_require_grads() - lora_model = get_peft_model(model, peft_config) - if training_args.bf16 and finetune_args.peft_type != "ia3": - lora_model = lora_model.to(torch.bfloat16) - lora_model.print_trainable_parameters() - gaudi_config = GaudiConfig() - gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - # Initialize our Trainer - trainer = GaudiTrainer( - model=lora_model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.do_eval else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, - ) - - # Solution for https://github.com/huggingface/peft/blob/v0.6.2/README.md#caveats (1) - if training_args.fsdp and training_args.fsdp_config["auto_wrap_policy"] == "TRANSFORMER_BASED_WRAP": - trainer.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(lora_model) - - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - if data_args.save_last_ckpt: - 
trainer.save_model() - - metrics = train_result.metrics - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - -if __name__ == "__main__": - main() diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py deleted file mode 100644 index 17c2432760..0000000000 --- a/examples/language-modeling/run_mlm.py +++ /dev/null @@ -1,704 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Training the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset. -Here is the full list of checkpoints on the hub that can be trained by this script: -https://huggingface.co/models?filter=fill-mask -""" -# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments. - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from itertools import chain -from typing import Optional - -import datasets -import evaluate -import torch -import transformers -from datasets import load_dataset -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - AutoConfig, - AutoModelForMaskedLM, - AutoTokenizer, - DataCollatorForLanguageModeling, - HfArgumentParser, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
- """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), - "choices": ["auto", "bfloat16", "float32"], - }, - ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "Setting it to True will benefit LLM loading time and RAM consumption." - ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." - ) - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mlm_probability: float = field( - default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} - ) - line_by_line: bool = field( - default=False, - metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - streaming: bool = field(default=False, metadata={"help": "Enable streaming mode."}) - - def __post_init__(self): - if self.streaming: - require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") - - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - if extension not in ["csv", "json", "txt"]: - raise ValueError("`train_file` should be a csv, a json or a txt file.") - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - if extension not in ["csv", "json", "txt"]: - raise ValueError("`validation_file` should be a csv, a json or a txt file.") - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_mlm", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub - # - # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this - # behavior (see below) - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if extension == "txt": - extension = "text" - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "token": model_args.token, - "trust_remote_code": model_args.trust_remote_code, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "token": model_args.token, - "trust_remote_code": model_args.trust_remote_code, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script. 
" - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if model_args.model_name_or_path: - torch_dtype = ( - model_args.torch_dtype - if model_args.torch_dtype in ["auto", None] - else getattr(torch, model_args.torch_dtype) - ) - model = AutoModelForMaskedLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, - ) - else: - logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) - - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. - embedding_size = model.get_input_embeddings().weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = list(raw_datasets["train"].features) - else: - column_names = list(raw_datasets["validation"].features) - text_column_name = "text" if "text" in column_names else column_names[0] - - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." - ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - if data_args.line_by_line: - # When using line_by_line, we just tokenize each nonempty line. - padding = "max_length" if data_args.pad_to_max_length else False - - def tokenize_function(examples): - # Remove empty lines - examples[text_column_name] = [ - line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() - ] - return tokenizer( - examples[text_column_name], - padding=padding, - truncation=True, - max_length=max_seq_length, - # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it - # receives the `special_tokens_mask`. - return_special_tokens_mask=True, - ) - - with training_args.main_process_first(desc="dataset map tokenization"): - if not data_args.streaming: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - remove_columns=[text_column_name], - ) - else: - # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. 
- # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more - # efficient when it receives the `special_tokens_mask`. - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_special_tokens_mask=True) - - with training_args.main_process_first(desc="dataset map tokenization"): - if not data_args.streaming: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - remove_columns=column_names, - ) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of - # max_seq_length. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict. - # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. - total_length = (total_length // max_seq_length) * max_seq_length - # Split by chunks of max_len. - result = { - k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] - for k, t in concatenated_examples.items() - } - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a - # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value - # might be slower to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/process#map - - with training_args.main_process_first(desc="grouping texts together"): - if not data_args.streaming: - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) - else: - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - ) - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = tokenized_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = tokenized_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - def preprocess_logits_for_metrics(logits, labels): - if isinstance(logits, tuple): - # Depending on the model and config, logits may contain extra tensors, - # like past_key_values, but logits always come first - logits = logits[0] - return logits.argmax(dim=-1) - - metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) - - def compute_metrics(eval_preds): - preds, labels = eval_preds - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics - labels = labels.reshape(-1) - preds = preds.reshape(-1) - mask = labels != -100 - labels = labels[mask] - preds = preds[mask] - return metric.compute(predictions=preds, references=labels) - - # Data collator - # This one will take care of randomly masking the tokens. 
- pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length - data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, - mlm_probability=data_args.mlm_probability, - pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, - ) - - # Initialize our Trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.do_eval else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - metrics = train_result.metrics - - if data_args.streaming: - metrics["train_samples"] = training_args.max_steps * training_args.per_device_train_batch_size - else: - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - if not data_args.streaming: - max_eval_samples = ( - data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - ) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py deleted file mode 100644 index 49043f3930..0000000000 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -""" -prompt/prefix/p tuning script for causal language modeling -Adapted from the following sources: -https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb -https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb -""" - -import logging -import math -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -import transformers -from datasets import load_dataset -from peft import ( - PrefixTuningConfig, - PromptEncoderConfig, - PromptTuningConfig, - PromptTuningInit, - TaskType, - get_peft_model, -) -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - default_data_collator, -) -from transformers.trainer_utils import is_main_process -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.38.0") -check_optimum_habana_min_version("1.10.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization. Don't set it if you want to train a model from" - " scratch." - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." 
- ) - }, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), - "choices": ["auto", "bfloat16", "float16", "float32"], - }, - ) - use_cache: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not the model should return the last key/values attentions (not used by all models)." - "Only relevant if `config.is_decoder=True`." - ) - }, - ) - low_cpu_mem_usage: bool = field( - default=False, - metadata={ - "help": ( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " - "Setting it to True will benefit LLM loading time and RAM consumption." - ) - }, - ) - peft_type: str = field( - default="prompt_tuning", - metadata={ - "help": ("The PEFT type to use."), - "choices": ["p_tuning", "prefix_tuning", "prompt_tuning"], - }, - ) - num_virtual_tokens: int = field( - default=8, - metadata={"help": ("the number of virtual tokens used in prompt/prefix/P tuning.")}, - ) - encoder_hidden_size: int = field( - default=1024, - metadata={"help": ("encoder_hidden_size if the encoder hidden size used in P tuning")}, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default="ought/raft", metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default="twitter_complaints", - metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - streaming: bool = field(default=False, metadata={"help": "Enable streaming mode."}) - - max_seq_length: Optional[int] = field( - default=64, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." 
- }, - ) - - def __post_init__(self): - if self.streaming: - require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") - - if self.dataset_name is None: - raise ValueError("Need either a dataset name or a training/validation file.") - - -def main(): - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - set_seed(training_args.seed) - if model_args.peft_type == "prompt_tuning": - peft_config = PromptTuningConfig( - task_type=TaskType.CAUSAL_LM, - prompt_tuning_init=PromptTuningInit.TEXT, - num_virtual_tokens=model_args.num_virtual_tokens, - prompt_tuning_init_text="Classify if the tweet is a complaint or not:", - tokenizer_name_or_path=model_args.model_name_or_path, - ) - elif model_args.peft_type == "p_tuning": - peft_config = PromptEncoderConfig( - task_type=TaskType.CAUSAL_LM, - num_virtual_tokens=model_args.num_virtual_tokens, - encoder_hidden_size=model_args.encoder_hidden_size, - ) - elif model_args.peft_type == "prefix_tuning": - peft_config = PrefixTuningConfig( - task_type=TaskType.CAUSAL_LM, - num_virtual_tokens=model_args.num_virtual_tokens, - ) - - max_length = data_args.max_seq_length - dataset = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - streaming=data_args.streaming, - ) - if data_args.dataset_name == "ought/raft" and data_args.dataset_config_name == "twitter_complaints": - text_column = "Tweet text" - label_column = "text_label" - else: - raise ValueError("preprocess is only for ought/raft twitter_complaints now") - classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names] - dataset = dataset.map( - lambda x: {"text_label": [classes[label] for label in x["Label"]]}, - batched=True, - num_proc=1, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - - def preprocess_function(examples): - batch_size = len(examples[text_column]) - inputs = [f"{x} Label : " for x in examples[text_column]] - targets = [str(x) for x in examples[label_column]] - model_inputs = tokenizer(inputs) - labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs - for i in range(batch_size): - sample_input_ids = model_inputs["input_ids"][i] - label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id] - model_inputs["input_ids"][i] = sample_input_ids + label_input_ids - labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids - model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i]) - for i in range(batch_size): - sample_input_ids = model_inputs["input_ids"][i] - label_input_ids = labels["input_ids"][i] - model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( - max_length - len(sample_input_ids) - ) + sample_input_ids - model_inputs["attention_mask"][i] = [0] * 
(max_length - len(sample_input_ids)) + model_inputs[ - "attention_mask" - ][i] - labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids - model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length]) - model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length]) - labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length]) - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - processed_datasets = dataset.map( - preprocess_function, - batched=True, - num_proc=1, - remove_columns=dataset["train"].column_names, - load_from_cache_file=False, - desc="Running tokenizer on dataset", - ) - - train_dataset = processed_datasets["train"] - eval_dataset = processed_datasets["train"] - - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "trust_remote_code": True if model_args.trust_remote_code else None, - "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, - "token": model_args.token, - } - # creating model - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - raise ValueError("Please provide value for model_name_or_path or config_name.") - model_dtype = torch.bfloat16 if training_args.bf16 else None - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - trust_remote_code=True if model_args.trust_remote_code else None, - torch_dtype=model_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, - token=model_args.token, - ) - model = get_peft_model(model, peft_config) - model.print_trainable_parameters() - - # training and evaluation - gaudi_config = GaudiConfig() - gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - # Initialize our Trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - data_collator=default_data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - ) - - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - - metrics = train_result.metrics - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - -if __name__ == "__main__": - main() diff --git a/examples/multi-node-training/EFA/.deepspeed_env b/examples/multi-node-training/EFA/.deepspeed_env deleted file mode 100644 index 258f1a0f67..0000000000 --- a/examples/multi-node-training/EFA/.deepspeed_env +++ /dev/null @@ -1,3 +0,0 @@ -HCCL_OVER_OFI=1 -HCCL_SOCKET_IFNAME=ens32 
-LD_LIBRARY_PATH=/root/hccl_ofi_wrapper:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile deleted file mode 100644 index 919c015be5..0000000000 --- a/examples/multi-node-training/EFA/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - -# Installs pdsh and upgrades pip -RUN apt-get update && apt-get install -y pdsh && \ - python -m pip install --upgrade pip - -# Installs hccl_ofi_wrapper to interact with libfabric and utilize the EFA HW networking mode -ARG OFI_WRAPPER_WS="/root/hccl_ofi_wrapper" -RUN git clone "https://github.com/HabanaAI/hccl_ofi_wrapper.git" "${OFI_WRAPPER_WS}" && \ - cd "${OFI_WRAPPER_WS}" && \ - LIBFABRIC_ROOT=/opt/amazon/efa make - -# Docker ssh port setup -RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ - sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \ - sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - service ssh restart - -# Installs Optimum Habana and Habana's fork of DeepSpeed -RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 - -CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ - chmod 600 ~/.ssh/id_rsa && \ - cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys && \ - /bin/bash diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile deleted file mode 100644 index c8d9f5ad62..0000000000 --- a/examples/multi-node-training/GaudiNIC/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - -# Installs pdsh and upgrades pip -RUN apt-get update && apt-get install -y pdsh && \ - python -m pip install --upgrade pip - -# Docker ssh port setup -RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ - sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config && \ - sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - service ssh restart - -# Installs Optimum Habana and Habana's fork of DeepSpeed -RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 - -CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ - chmod 600 ~/.ssh/id_rsa && \ - cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys && \ - /bin/bash diff --git a/examples/multi-node-training/README.md b/examples/multi-node-training/README.md deleted file mode 100644 index 0e40e616f8..0000000000 --- a/examples/multi-node-training/README.md +++ /dev/null @@ -1,124 +0,0 @@ - - -# Multi-node Training - -Multi-node training can be performed easily on Gaudi with DeepSpeed for any training script as follows: -```bash -python gaudi_spawn.py \ - --hostfile path_to_my_hostfile --use_deepspeed \ - path_to_my_script.py --args1 --args2 ... --argsN \ - --deepspeed path_to_my_deepspeed_config -``` -where `--argX` is an argument of the script to run. - -## Setup - -Check out the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/multi_node_training) to learn how to set up your Gaudi instances for multi-node runs on premises or on AWS. - -We provide two `Dockerfile`s to easily start your multi-node runs: -- A `Dockerfile` provided [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/EFA/Dockerfile) for multi-node runs on AWS.
-- A `Dockerfile` provided [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/GaudiNIC/Dockerfile) for multi-node runs using GaudiNIC. - - -Both Dockerfiles are based on an image compatible with Ubuntu 22.04, but you can easily adapt them to another OS. - -To build the Docker image, run: -```bash -docker build -t gaudi_multi_node PATH -``` -where `PATH` is the path to the folder containing the `Dockerfile`. - -To run a Docker container with the image you just built, execute: -```bash -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host gaudi_multi_node:latest -``` - -> For AWS DL1 instances, `--privileged` must be passed to the `docker run` command so that EFA interfaces are visible. - -You will need to copy the `id_rsa.pub` key of the leader node's container to `~/.ssh/authorized_keys` in every other node's container to enable password-less SSH: - - a. Copy `id_rsa.pub` to `~/.ssh/authorized_keys` on each node - ```bash - cat id_rsa.pub > authorized_keys - vi authorized_keys - ``` - b. Copy the leader node's `id_rsa.pub` key contents to other systems' `authorized_keys`. - - -Finally, on each system, add all hosts (including itself) to `known_hosts`. The IP addresses used below are just for illustration: - ```bash - ssh-keyscan -p 3022 -H 10.10.100.101 >> ~/.ssh/known_hosts - ssh-keyscan -p 3022 -H 10.10.100.102 >> ~/.ssh/known_hosts - ssh-keyscan -p 3022 -H 10.10.100.103 >> ~/.ssh/known_hosts - ssh-keyscan -p 3022 -H 10.10.100.104 >> ~/.ssh/known_hosts - ``` - -You can check that the SSH setup is working as follows: - -1. Run `lsof -i` inside the Docker container on each node to make sure sshd is up. The output should look like this: -```bash -COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME -sshd 35 root 3u IPv4 23262521 0t0 TCP *:3022 (LISTEN) -sshd 35 root 4u IPv6 23262523 0t0 TCP *:3022 (LISTEN) -``` -If sshd is not running, restart it with the following commands: -```bash -sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config -sed -i 's/# Port 22/ Port 3022/g' /etc/ssh/ssh_config -sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config -service ssh restart -``` -2. Test SSH between nodes with `ssh -p 3022 IP-address` to make sure they can communicate with each other. - -3. Run the `gaudi_spawn.py` training command with a world size of 8 for a few steps to make sure it works for 8 ranks on each node. - -4. Start the multi-node run with `gaudi_spawn.py` from the main node's container (the node with the first IP address in the hostfile). - - -## Hostfile - -DeepSpeed requires a [hostfile](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) to know the address of each node and the number of devices to use on it. You can specify its path with `--hostfile`. This file should look like this: -``` -ip_1 slots=8 -ip_2 slots=8 -... -ip_n slots=8 -``` - -You can find a template [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/hostfile). - - -## Environment variables - -If you need to set environment variables for all nodes, you can specify them in a `.deepspeed_env` file which should be located in the local path you are executing from or in your home directory. It is formatted as follows: -``` -env_variable_1_name=value -env_variable_2_name=value -...
-``` - -You can find an example for AWS instances [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/EFA/.deepspeed_env). - -> Note that one should set `HCCL_OVER_OFI=1` and `LD_LIBRARY_PATH=/root/hccl_ofi_wrapper:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib` only on AWS DL1 instances. *These should not be used otherwise*. - - -## Recommendations - -- It is strongly recommended to use gradient checkpointing for multi-node runs to get the highest speedups. You can enable it with `--gradient_checkpointing` in all these examples or with `gradient_checkpointing=True` in your `GaudiTrainingArguments`. -- Larger batch sizes should lead to higher speedups. -- Multi-node inference is not recommended and can provide inconsistent results. -- On AWS DL1 instances, run your Docker containers with the `--privileged` flag so that EFA devices are visible. diff --git a/examples/multi-node-training/hostfile b/examples/multi-node-training/hostfile deleted file mode 100644 index 3936c43ac7..0000000000 --- a/examples/multi-node-training/hostfile +++ /dev/null @@ -1,4 +0,0 @@ -ip1 slots=8 -ip2 slots=8 -ip3 slots=8 -ip4 slots=8 diff --git a/examples/object-segementation/README.md b/examples/object-segementation/README.md deleted file mode 100644 index 4afb598492..0000000000 --- a/examples/object-segementation/README.md +++ /dev/null @@ -1,32 +0,0 @@ - - -# Object Segmentation Examples - -This directory contains an example script that demonstrates how to perform object segmentation on Gaudi with graph mode. - -## Single-HPU inference - -```bash -python3 run_example.py \ - --model_name_or_path "CIDAS/clipseg-rd64-refined" \ - --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \ - --prompt "cat, remote, blanket" \ - --warmup 3 \ - --n_iterations 20 \ - --use_hpu_graphs \ - --bf16 \ - --print_result -``` -Models that have been validated: - - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) \ No newline at end of file diff --git a/examples/object-segementation/run_example.py b/examples/object-segementation/run_example.py deleted file mode 100644 index 9a36c50b96..0000000000 --- a/examples/object-segementation/run_example.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-
-# Copied from https://huggingface.co/docs/transformers/main/en/model_doc/clipseg
-
-import argparse
-import time
-
-import habana_frameworks.torch as ht
-import requests
-import torch
-from PIL import Image
-from torchvision.utils import save_image
-from transformers import AutoProcessor, CLIPSegForImageSegmentation
-
-from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--model_name_or_path",
-        default="CIDAS/clipseg-rd64-refined",
-        type=str,
-        help="Path of the pre-trained model",
-    )
-    parser.add_argument(
-        "--image_path",
-        default="http://images.cocodataset.org/val2017/000000039769.jpg",
-        type=str,
-        help='Path of the input image. Should be a single string (eg: --image_path "URL")',
-    )
-    parser.add_argument(
-        "--prompt",
-        default="a cat,a remote,a blanket",
-        type=str,
-        help='Prompt for segmentation. It should be a comma-separated string. (eg: --prompt "a photo of a cat, a photo of a dog")',
-    )
-    parser.add_argument(
-        "--use_hpu_graphs",
-        action="store_true",
-        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
-    )
-    parser.add_argument(
-        "--bf16",
-        action="store_true",
-        help="Whether to use bf16 precision for segmentation.",
-    )
-    parser.add_argument(
-        "--print_result",
-        action="store_true",
-        help="Whether to save the segmented output images.",
-    )
-    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
-    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
-
-    args = parser.parse_args()
-
-    adapt_transformers_to_gaudi()
-
-    processor = AutoProcessor.from_pretrained(args.model_name_or_path)
-    model = CLIPSegForImageSegmentation.from_pretrained(
-        args.model_name_or_path
-    )  # Use CLIPSegForImageSegmentation instead of AutoModel.
-    # The output will contain the logits, which are required to generate the segmented images.
-
-    image = Image.open(requests.get(args.image_path, stream=True).raw)
-    texts = args.prompt.split(",")
-
-    if args.use_hpu_graphs:
-        model = ht.hpu.wrap_in_hpu_graph(model)
-
-    autocast = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16)
-    model.to("hpu")
-
-    with torch.no_grad(), autocast:
-        for i in range(args.warmup):
-            inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to("hpu")
-            outputs = model(**inputs)
-            torch.hpu.synchronize()
-
-        total_model_time = 0
-        for i in range(args.n_iterations):
-            inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to("hpu")
-            model_start_time = time.time()
-            outputs = model(**inputs)
-            torch.hpu.synchronize()
-            model_end_time = time.time()
-            total_model_time = total_model_time + (model_end_time - model_start_time)
-
-            if args.print_result:
-                if i == 0:  # generate and save the output only once
-                    logits = outputs.logits
-                    for j in range(logits.shape[0]):
-                        threshold = 0.5
-                        segmented_image = ((torch.sigmoid(logits[j]) > threshold) * 255).unsqueeze(0)
-                        segmented_image = segmented_image.to(torch.float32)
-                        save_image(segmented_image, "segmented_" + texts[j].strip() + ".png")
-                    print("Segmented images are generated.")
-
-    print("n_iterations: " + str(args.n_iterations))
-    print("Total latency (ms): " + str(total_model_time * 1000))
-    print("Average latency (ms): " + str(total_model_time * 1000 / args.n_iterations))
diff --git a/examples/protein-folding/README.md b/examples/protein-folding/README.md
deleted file mode 100644
index d5003e1e41..0000000000
--- a/examples/protein-folding/README.md
+++ /dev/null
@@ -1,85 +0,0 @@
-
-
-# ESMFold Example
-
-ESMFold ([paper link](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2)) is a recently released protein folding model from FAIR. Unlike other protein folding models, it does not require external databases or search tools to predict structures, and is up to 60X faster as a result.
-
-The port to the Hugging Face Transformers library is even easier to use, as we've removed the dependency on tools like openfold - once you run `pip install transformers`, you're ready to use this model!
-
-Note that all the code that follows will be running the model locally, rather than calling an external API. This means that no rate limiting applies here - you can predict as many structures as your computer can handle.
-
-## Single-HPU inference
-
-Here we show how to predict the folding of a single chain on HPU:
-
-```bash
-python run_esmfold.py
-```
-The predicted protein structure will be stored in the `save-hpu.pdb` file. You can then use a tool such as py3Dmol to visualize it, as sketched below.
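For instance, a minimal visualization sketch (assuming `py3Dmol` has been installed with `pip install py3Dmol` and that you are running inside a Jupyter notebook) could look like this:

```python
import py3Dmol

# Load the structure predicted by run_esmfold.py and render it as a cartoon.
with open("save-hpu.pdb") as f:
    pdb_string = f.read()

view = py3Dmol.view(width=600, height=400)
view.addModel(pdb_string, "pdb")
view.setStyle({"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()  # renders the structure inline in the notebook
```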
- - -# Mila-Intel protST example - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-HPU inference for zero shot evaluation -Here we show how to run zero shot evaluation of protein ST model on HPU: - -```bash -python run_zero_shot_eval.py --bf16 --max_seq_length 1024 -``` -## Multi-HPU finetune for sequence classification task - -```bash -python ../gaudi_spawn.py --world_size 8 --use_mpi run_sequence_classification.py \ - --output_dir ./out \ - --model_name_or_path mila-intel/protst-esm1b-for-sequential-classification \ - --tokenizer_name facebook/esm1b_t33_650M_UR50S \ - --trust_remote_code \ - --dataset_name mila-intel/ProtST-BinaryLocalization \ - --torch_dtype bfloat16 \ - --overwrite_output_dir \ - --do_train \ - --per_device_train_batch_size 32 \ - --gradient_accumulation_steps 1 \ - --learning_rate 5e-05 \ - --weight_decay 0 \ - --num_train_epochs 100 \ - --lr_scheduler_type constant \ - --do_eval \ - --evaluation_strategy epoch \ - --per_device_eval_batch_size 32 \ - --logging_strategy epoch \ - --save_strategy epoch \ - --save_steps 820 \ - --dataloader_num_workers 0 \ - --report_to none \ - --optim adamw_torch \ - --label_names labels \ - --load_best_model_at_end \ - --metric_for_best_model accuracy \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training -``` - diff --git a/examples/protein-folding/requirements.txt b/examples/protein-folding/requirements.txt deleted file mode 100644 index 9caf8e887c..0000000000 --- a/examples/protein-folding/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -datasets>=2.14.0 -scikit-learn diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py deleted file mode 100644 index 096f18055a..0000000000 --- a/examples/protein-folding/run_esmfold.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This script is based on https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb -import os -import time - -import habana_frameworks.torch.core as htcore -import torch -from transformers import AutoTokenizer, EsmForProteinFolding -from transformers.models.esm.openfold_utils.feats import atom14_to_atom37 -from transformers.models.esm.openfold_utils.protein import Protein as OFProtein -from transformers.models.esm.openfold_utils.protein import to_pdb - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -os.environ["PT_HPU_ENABLE_H2D_DYNAMIC_SLICE"] = "0" -os.environ["PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES"] = "1" - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.11.0") - - -def convert_outputs_to_pdb(outputs): - """ - Converts the model outputs to a PDB file. - - This code comes from the original ESMFold repo, and uses some functions from openfold that have been ported to Transformers. - """ - final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs) - outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()} - final_atom_positions = final_atom_positions.cpu().numpy() - final_atom_mask = outputs["atom37_atom_exists"] - pdbs = [] - for i in range(outputs["aatype"].shape[0]): - aa = outputs["aatype"][i] - pred_pos = final_atom_positions[i] - mask = final_atom_mask[i] - resid = outputs["residue_index"][i] + 1 - pred = OFProtein( - aatype=aa, - atom_positions=pred_pos, - atom_mask=mask, - residue_index=resid, - b_factors=outputs["plddt"][i], - chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None, - ) - pdbs.append(to_pdb(pred)) - return pdbs - - -adapt_transformers_to_gaudi() - -steps = 4 -device = torch.device("hpu") - -# This is the sequence for human GNAT1. -# Feel free to substitute your own peptides of interest -# Depending on memory constraints you may wish to use shorter sequences. -test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF" # len = 350 - -tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1") -model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=False) -model = model.to(device) - -# Uncomment this line if you're folding longer (over 600 or so) sequences -model.trunk.set_chunk_size(64) - -with torch.no_grad(): - tk = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False) - tokenized_input = tk["input_ids"] - print(f"ESMFOLD: input shape = {tokenized_input.shape}") - tokenized_input = tokenized_input.to(device) - - for batch in range(steps): - print(f"ESMFOLD: step {batch} start ...") - start = time.time() - output = model(tokenized_input) - htcore.mark_step() - print(f"ESMFOLD: step {batch} duration: {time.time() - start:.03f} seconds") - -pdb = convert_outputs_to_pdb(output) -pdb_file = "save-hpu.pdb" -with open(pdb_file, "w") as fout: - fout.write(pdb[0]) - print(f"pdb file saved in {pdb_file}") diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py deleted file mode 100644 index 502aec7d12..0000000000 --- a/examples/protein-folding/run_sequence_classification.py +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
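# This script fine-tunes the ProtST ESM-1b sequence classification checkpoint
# (mila-intel/protst-esm1b-for-sequential-classification) on the mila-intel/ProtST-BinaryLocalization
# dataset with GaudiTrainer; see the README above for the full `gaudi_spawn.py` launch command.
# Note that `create_optimizer()` below freezes every parameter whose name does not contain
# "classifier", so only the classification head is actually trained.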
- -import functools -import logging -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import torch -import transformers -from datasets import load_dataset -from sklearn.metrics import accuracy_score, matthews_corrcoef -from transformers import AutoModel, AutoTokenizer, HfArgumentParser, Trainer -from transformers.data.data_collator import DataCollatorWithPadding -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.trainer_pt_utils import get_parameter_names - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.11.0") - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def create_optimizer(opt_model, lr_ratio=0.1): - head_names = [] - for n, p in opt_model.named_parameters(): - if "classifier" in n: - head_names.append(n) - else: - p.requires_grad = False - # turn a list of tuple to 2 lists - for n, p in opt_model.named_parameters(): - if n in head_names: - assert p.requires_grad - backbone_names = [] - for n, p in opt_model.named_parameters(): - if n not in head_names and p.requires_grad: - backbone_names.append(n) - - decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) # forbidden layer norm - decay_parameters = [name for name in decay_parameters if "bias" not in name] - # training_args.learning_rate - head_decay_parameters = [name for name in head_names if name in decay_parameters] - head_not_decay_parameters = [name for name in head_names if name not in decay_parameters] - # training_args.learning_rate * model_config.lr_ratio - backbone_decay_parameters = [name for name in backbone_names if name in decay_parameters] - backbone_not_decay_parameters = [name for name in backbone_names if name not in decay_parameters] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in opt_model.named_parameters() if (n in head_decay_parameters and p.requires_grad)], - "weight_decay": training_args.weight_decay, - "lr": training_args.learning_rate, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in backbone_decay_parameters and p.requires_grad) - ], - "weight_decay": training_args.weight_decay, - "lr": training_args.learning_rate * lr_ratio, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in head_not_decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - "lr": training_args.learning_rate, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n in backbone_not_decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - "lr": training_args.learning_rate * lr_ratio, - }, - ] - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) - optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - - return optimizer - - -def create_scheduler(training_args, optimizer): - from transformers.optimization import get_scheduler - - return get_scheduler( - training_args.lr_scheduler_type, - optimizer=optimizer if optimizer is None else optimizer, - num_warmup_steps=training_args.get_warmup_steps(training_args.max_steps), - num_training_steps=training_args.max_steps, - ) - - -def compute_metrics(eval_preds): - probs, labels = 
eval_preds - preds = np.argmax(probs, axis=-1) - result = {"accuracy": accuracy_score(labels, preds), "mcc": matthews_corrcoef(labels, preds)} - return result - - -def preprocess_logits_for_metrics(logits, labels): - return torch.softmax(logits, dim=-1) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - trust_remote_code: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), - "choices": ["auto", "bfloat16", "float16", "float32"], - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - max_length: Optional[str] = field( - default=1024, - metadata={"help": ("the max length that input id will be padded to")}, - ) - - -if __name__ == "__main__": - model_args, data_args, training_args = HfArgumentParser( - (ModelArguments, DataTrainingArguments, GaudiTrainingArguments) - ).parse_args_into_dataclasses() - - transformers.utils.logging.set_verbosity_info() - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - torch_dtype = ( - model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) - ) - model = AutoModel.from_pretrained( - model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype - ) - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name) - - def tokenize_protein(example, tokenizer=None): - protein_seq = example["prot_seq"] - protein_seq_str = tokenizer(protein_seq, add_special_tokens=True) - example["input_ids"] = protein_seq_str["input_ids"] - example["attention_mask"] = protein_seq_str["attention_mask"] - example["labels"] = example["localization"] - return example - - func_tokenize_protein = functools.partial(tokenize_protein, tokenizer=tokenizer) - - if data_args.dataset_name != "mila-intel/ProtST-BinaryLocalization": - raise ValueError("preprocess is only for mila-intel/ProtST-BinaryLocalization now") - raw_dataset = load_dataset(data_args.dataset_name) - for split in ["train", "validation", "test"]: - raw_dataset[split] = raw_dataset[split].map( - func_tokenize_protein, batched=False, remove_columns=["Unnamed: 0", "prot_seq", "localization"] - ) - - data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=data_args.max_length) - - optimizer = create_optimizer(model) - scheduler = create_scheduler(training_args, optimizer) - - # build trainer - gaudi_config = GaudiConfig() - 
gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=raw_dataset["train"], - eval_dataset=raw_dataset["validation"], - data_collator=data_collator, - optimizers=(optimizer, scheduler), - compute_metrics=compute_metrics, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - if training_args.do_train: - train_result = trainer.train() - trainer.save_model() - # Saves the tokenizer too for easy upload - tokenizer.save_pretrained(training_args.output_dir) - - metrics = train_result.metrics - metrics["train_samples"] = len(raw_dataset["train"]) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - if training_args.do_eval: - metrics = trainer.evaluate(raw_dataset["test"], metric_key_prefix="test") - trainer.log_metrics("test", metrics) - trainer.save_metrics("test", metrics) - - metrics = trainer.evaluate(raw_dataset["validation"], metric_key_prefix="eval") - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py deleted file mode 100644 index 56b6e16411..0000000000 --- a/examples/protein-folding/run_zero_shot_eval.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import functools -import json -import logging -import sys - -import torch -from datasets import load_dataset -from tqdm import tqdm -from transformers import AutoModel, AutoTokenizer - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.11.0") - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) - -logger = logging.getLogger(__name__) - - -def parse_args(args): - parser = argparse.ArgumentParser(description="Simple example of protST zero shot evaluation.") - parser.add_argument( - "--output_dir", - type=str, - default=None, - help="output dir", - ) - parser.add_argument( - "--max_seq_length", - type=int, - default=1024, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", - ) - parser.add_argument( - "--bf16", action="store_true", help="Whether to perform zero shot evaluation in bf16 precision." 
- ) - - return parser.parse_args(args) - - -def tokenize_protein(example, protein_tokenizer=None, max_seq_length=None): - protein_seqs = example["prot_seq"] - - protein_inputs = protein_tokenizer( - protein_seqs, padding="max_length", truncation=True, add_special_tokens=True, max_length=max_seq_length - ) - example["protein_input_ids"] = protein_inputs.input_ids - example["protein_attention_mask"] = protein_inputs.attention_mask - - return example - - -def label_embedding(labels, text_tokenizer, text_model, device): - # embed label descriptions - label_feature = [] - with torch.inference_mode(): - for label in labels: - label_input_ids = text_tokenizer.encode( - label, max_length=128, truncation=True, add_special_tokens=False, padding="max_length" - ) - label_input_ids = [text_tokenizer.cls_token_id] + label_input_ids - label_input_ids = torch.tensor(label_input_ids, dtype=torch.long, device=device).unsqueeze(0) - attention_mask = label_input_ids != text_tokenizer.pad_token_id - attention_mask = attention_mask.to(device) - text_outputs = text_model(label_input_ids, attention_mask=attention_mask) - - label_feature.append(text_outputs["text_feature"].clone()) - label_feature = torch.cat(label_feature, dim=0) - label_feature = label_feature / label_feature.norm(dim=-1, keepdim=True) - - return label_feature - - -def zero_shot_eval(logger, device, test_dataset, target_field, protein_model, logit_scale, label_feature): - # get prediction and target - test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False) - preds, targets = [], [] - with torch.inference_mode(): - for data in tqdm(test_dataloader): - target = data[target_field] - targets.append(target) - - protein_input_ids = torch.tensor(data["protein_input_ids"], dtype=torch.long, device=device).unsqueeze(0) - attention_mask = torch.tensor(data["protein_attention_mask"], dtype=torch.long, device=device).unsqueeze(0) - protein_outputs = protein_model(protein_input_ids, attention_mask=attention_mask) - protein_feature = protein_outputs["protein_feature"] - protein_feature = protein_feature / protein_feature.norm(dim=-1, keepdim=True) - pred = logit_scale * protein_feature @ label_feature.t() - preds.append(pred) - preds = torch.cat(preds, dim=0) - targets = torch.tensor(targets, dtype=torch.long, device=device) - accuracy = (preds.argmax(dim=-1) == targets).float().mean().item() - logger.info("Zero-shot accuracy: %.6f" % accuracy) - return accuracy - - -def main(args): - args = parse_args(args) - adapt_transformers_to_gaudi() - - device = torch.device("hpu") - model_dtype = torch.bfloat16 if args.bf16 else None - protst_model = AutoModel.from_pretrained( - "mila-intel/ProtST-esm1b", trust_remote_code=True, torch_dtype=model_dtype - ).to(device) - protein_model = protst_model.protein_model - text_model = protst_model.text_model - logit_scale = protst_model.logit_scale - - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - protein_model = wrap_in_hpu_graph(protein_model) - text_model = wrap_in_hpu_graph(text_model) - logit_scale.requires_grad = False - logit_scale = logit_scale.to(device) - logit_scale = logit_scale.exp() - - protein_tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S") - text_tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract") - - raw_datasets = load_dataset("mila-intel/ProtST-SubcellularLocalization", split="test") - func_tokenize_protein = functools.partial( - tokenize_protein, protein_tokenizer=protein_tokenizer, 
max_seq_length=args.max_seq_length - ) - test_dataset = raw_datasets.map( - func_tokenize_protein, - batched=False, - remove_columns=["prot_seq"], - desc="Running tokenize_proteins on dataset", - ) - - labels = load_dataset("mila-intel/subloc_template")["train"]["name"] - - text_tokenizer.encode(labels[0], max_length=128, truncation=True, add_special_tokens=False) - label_feature = label_embedding(labels, text_tokenizer, text_model, device) - accuracy = zero_shot_eval(logger, device, test_dataset, "localization", protein_model, logit_scale, label_feature) - if args.output_dir is not None: - metrics = {"accuracy": accuracy} - with open(f"{args.output_dir}/accuracy_metrics.json", mode="w") as file: - json.dump(metrics, file) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md deleted file mode 100755 index d531bd9fcd..0000000000 --- a/examples/question-answering/README.md +++ /dev/null @@ -1,256 +0,0 @@ - - -# Question Answering Examples on SQuAD - -Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py). - -**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it -uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in -[this table](https://huggingface.co/transformers/index.html#supported-frameworks). - -`run_qa.py` allows you to fine-tune any supported model on the SQUAD dataset or another question-answering dataset of the `datasets` library or your own csv/jsonlines files as long as they are structured the same way as SQUAD. You might need to tweak the data processing inside the script if your data is structured differently. - -Note that if your dataset contains samples with no possible answers (like SQUAD version 2), you need to pass along the flag `--version_2_with_negative`. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Fine-tuning BERT on SQuAD1.1 - -For the following cases, an example of a Gaudi configuration file is given -[here](https://github.com/huggingface/optimum-habana#how-to-use-it). - - -### Single-card Training - -This example code fine-tunes BERT on the SQuAD1.1 dataset. 
- -```bash -PT_HPU_LAZY_MODE=0 python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 24 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -### Multi-card Training - -Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs: - -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 24 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -### Using DeepSpeed - -Similarly to multi-card training, here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using DeepSpeed with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 24 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - - -### Training in torch.compile mode - -Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \ -a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \ -b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags. - - -## Fine-tuning Llama on SQuAD1.1 - -> [!NOTE] -> Llama/Llama2 for question answering requires Transformers v4.38.0 or newer, which supports the `LlamaForQuestionAnswering` class. 
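If you are unsure which Transformers version is installed in your environment, a quick sanity check (a minimal sketch, not part of the example scripts) is:

```python
# Check that the installed Transformers release is recent enough for LlamaForQuestionAnswering.
import transformers
from packaging import version

assert version.parse(transformers.__version__) >= version.parse("4.38.0"), (
    f"Transformers {transformers.__version__} does not provide LlamaForQuestionAnswering, "
    "please upgrade it (e.g. `pip install --upgrade transformers`)."
)
```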
- -Here is a command you can run to train a Llama model for question answering: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path FlagAlpha/Llama2-Chinese-13b-Chat \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --max_train_samples 45080 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json -``` - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with BERT on SQuAD on 1 Gaudi card with the following command: -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_eval \ - --per_device_eval_batch_size 8 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --bf16 -``` - - -## Recommended Hyperparameters for Mixed Precision - -| | learning_rate | num_train_epochs | per_device_train_batch_size | per_device_eval_batch_size | -|----------------------------|:----:|:--:|:-:|:-:| -| BERT base | 3e-5 | 2 | 24 | 8 | -| BERT large | 3e-5 | 2 | 24 | 8 | -| RoBERTa base | 3e-5 | 2 | 12 | 8 | -| RoBERTa large | 3e-5 | 2 | 12 | 8 | -| ALBERT large (single-card) | 5e-5 | 2 | 32 | 4 | -| ALBERT large (multi-card) | 6e-5 | 2 | 32 | 4 | -| ALBERT XXL (single-card) | 5e-6 | 2 | 16 | 2 | -| ALBERT XXL (multi-card) | 5e-5 | 2 | 16 | 2 | -| DistilBERT | 5e-5 | 3 | 8 | 8 | -| meta-llama/Llama-2-13b-chat-hf (multi-card) | 3e-5 | 2 | 8 | 8 | -| FlagAlpha/Llama2-Chinese-13b-Chat (multi-card) | 3e-5 | 2 | 8 | 8 | - - -## Fine-tuning T5 on SQuAD2.0 - -The [`run_seq2seq_qa.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 
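To make the difference concrete, here is a minimal CPU sketch contrasting the two output styles (the checkpoints are chosen purely for illustration and are not part of these examples):

```python
from transformers import pipeline

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare around the year 1600."

# Extractive QA (run_qa.py style): the model scores start/end token positions inside the context.
extractive = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
print(extractive(question=question, context=context))  # e.g. {'answer': 'William Shakespeare', ...}

# Generative QA (run_seq2seq_qa.py style): the model generates the answer text token by token.
generative = pipeline("text2text-generation", model="t5-small")
print(generative(f"question: {question} context: {context}"))
```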
- -The following command fine-tunes T5 on the SQuAD2.0 dataset: - -```bash -python run_seq2seq_qa.py \ - --model_name_or_path t5-small \ - --gaudi_config_name Habana/t5 \ - --dataset_name squad_v2 \ - --version_2_with_negative \ - --context_column context \ - --question_column question \ - --answer_column answers \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 33 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/seq2seq_squad/ \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -For multi-card and DeepSpeed runs, you can use `python ../gaudi_spawn.py --world_size 8 --use_mpi` and `python ../gaudi_spawn.py --world_size 8 --use_deepspeed` as shown in the previous sections. diff --git a/examples/question-answering/fsdp_config.json b/examples/question-answering/fsdp_config.json deleted file mode 100644 index 27e9aeaf77..0000000000 --- a/examples/question-answering/fsdp_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", - "fsdp_backward_prefetch": "BACKWARD_PRE", - "fsdp_forward_prefetch": false, - "fsdp_offload_params": false, - "fsdp_sharding_strategy": 1, - "fsdp_state_dict_type": "FULL_STATE_DICT", - "fsdp_sync_module_states": true, - "fsdp_use_orig_params": true, - "transformer_layer_cls_to_wrap": "BertLayer", - "fsdp_activation_checkpointing": false -} diff --git a/examples/question-answering/requirements.txt b/examples/question-answering/requirements.txt deleted file mode 100644 index 3c204b21c7..0000000000 --- a/examples/question-answering/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -datasets >= 2.4.0, <= 2.19.2 -torch >= 1.3.0, <= 2.4.0 -evaluate == 0.4.2 diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py deleted file mode 100644 index e58f7f42a2..0000000000 --- a/examples/question-answering/run_qa.py +++ /dev/null @@ -1,732 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
- -import logging -import os -import sys -import warnings -from dataclasses import dataclass, field -from typing import Optional - -import datasets -import evaluate -import transformers -from datasets import load_dataset -from trainer_qa import QuestionAnsweringTrainer -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - default_data_collator, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version -from utils_qa import postprocess_qa_predictions - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: int = field( - default=384, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when" - " batching to the maximum length in the batch (which can be faster on GPU but will be slower on HPU)." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - ) - }, - ) - version_2_with_negative: bool = field( - default=False, metadata={"help": "If true, some of the examples do not have an answer."} - ) - null_score_diff_threshold: float = field( - default=0.0, - metadata={ - "help": ( - "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. " - "Only useful when `version_2_with_negative=True`." - ) - }, - ) - doc_stride: int = field( - default=128, - metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, - ) - n_best_size: int = field( - default=20, - metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, - ) - max_answer_length: int = field( - default=30, - metadata={ - "help": ( - "The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another." 
- ) - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation file/test_file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_qa", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." 
- ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - field="data", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=True, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - if config.model_type == "llama": - if tokenizer.pad_token is None: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - tokenizer.cls_token = tokenizer.bos_token - model = AutoModelForQuestionAnswering.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): - raise ValueError( - "This example script only works for models that have a fast tokenizer. 
Checkout the big table of models at" - " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" - " this requirement" - ) - - # Preprocessing the datasets. - # Preprocessing is slightly different for training and evaluation. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - column_names = raw_datasets["validation"].column_names - else: - column_names = raw_datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). - pad_on_right = tokenizer.padding_side == "right" - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - # Training preprocessing - def prepare_train_features(examples): - # Some of the questions have lots of whitespace on the left, which is not useful and will make the - # truncation of the context fail (the tokenized question will take a lots of space). So we remove that - # left whitespace - examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] - - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - # The offset mappings will give us a map from token to character position in the original context. This will - # help us compute the start_positions and end_positions. - offset_mapping = tokenized_examples.pop("offset_mapping") - - # Let's label those examples! - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - - for i, offsets in enumerate(offset_mapping): - # We will label impossible answers with the index of the CLS token. - input_ids = tokenized_examples["input_ids"][i] - if tokenizer.cls_token_id in input_ids: - cls_index = input_ids.index(tokenizer.cls_token_id) - elif tokenizer.bos_token_id in input_ids: - cls_index = input_ids.index(tokenizer.bos_token_id) - else: - cls_index = 0 - - # Grab the sequence corresponding to that example (to know what is the context and what is the question). 
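            # (With a fast tokenizer, sequence_ids(i) returns None for special tokens, 0 for tokens of
            # the first sequence and 1 for tokens of the second, which is what lets us tell question
            # tokens apart from context tokens below.)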
- sequence_ids = tokenized_examples.sequence_ids(i) - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - answers = examples[answer_column_name][sample_index] - # If no answers are given, set the cls_index as answer. - if len(answers["answer_start"]) == 0: - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Start/end character index of the answer in the text. - start_char = answers["answer_start"][0] - end_char = start_char + len(answers["text"][0]) - - # Start token index of the current span in the text. - token_start_index = 0 - while sequence_ids[token_start_index] != (1 if pad_on_right else 0): - token_start_index += 1 - - # End token index of the current span in the text. - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != (1 if pad_on_right else 0): - token_end_index -= 1 - - # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). - if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Otherwise move the token_start_index and token_end_index to the two ends of the answer. - # Note: we could go after the last offset if the answer is the last word (edge case). - while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: - token_start_index += 1 - tokenized_examples["start_positions"].append(token_start_index - 1) - while offsets[token_end_index][1] >= end_char: - token_end_index -= 1 - tokenized_examples["end_positions"].append(token_end_index + 1) - - return tokenized_examples - - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - # We will select sample from whole data if argument is specified - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - # Create train feature from dataset - with training_args.main_process_first(desc="train dataset map pre-processing"): - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - if data_args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - # Validation preprocessing - def prepare_validation_features(examples): - # Some of the questions have lots of whitespace on the left, which is not useful and will make the - # truncation of the context fail (the tokenized question will take a lots of space). So we remove that - # left whitespace - examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] - - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. 
This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. - tokenized_examples["example_id"] = [] - - for i in range(len(tokenized_examples["input_ids"])): - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - context_index = 1 if pad_on_right else 0 - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - tokenized_examples["example_id"].append(examples["id"][sample_index]) - - # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token - # position is part of the context or not. - tokenized_examples["offset_mapping"][i] = [ - (o if sequence_ids[k] == context_index else None) - for k, o in enumerate(tokenized_examples["offset_mapping"][i]) - ] - - return tokenized_examples - - if training_args.do_eval: - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - # We will select sample from whole data - max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) - eval_examples = eval_examples.select(range(max_eval_samples)) - # Validation Feature Creation - with training_args.main_process_first(desc="validation dataset map pre-processing"): - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - if data_args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - if training_args.do_predict: - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_examples = raw_datasets["test"] - if data_args.max_predict_samples is not None: - # We will select sample from whole data - predict_examples = predict_examples.select(range(data_args.max_predict_samples)) - # Predict Feature Creation - with training_args.main_process_first(desc="prediction dataset map pre-processing"): - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - 
num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) - if data_args.max_predict_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) - - # Data collator - # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data - # collator. - data_collator = ( - default_data_collator - if data_args.pad_to_max_length - else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, stage="eval"): - # Post-processing: we match the start logits and end logits to answers in the original context. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - version_2_with_negative=data_args.version_2_with_negative, - n_best_size=data_args.n_best_size, - max_answer_length=data_args.max_answer_length, - null_score_diff_threshold=data_args.null_score_diff_threshold, - output_dir=training_args.output_dir, - log_level=log_level, - prefix=stage, - ) - # Format the result to the format the metric expects. - if data_args.version_2_with_negative: - formatted_predictions = [ - {"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - if data_args.version_2_with_negative: - accepted_best_metrics = ("exact", "f1", "HasAns_exact", "HasAns_f1") - else: - accepted_best_metrics = ("exact_match", "f1") - - if training_args.load_best_model_at_end and training_args.metric_for_best_model not in accepted_best_metrics: - warnings.warn(f"--metric_for_best_model should be set to one of {accepted_best_metrics}") - - metric = evaluate.load( - "squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir - ) - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - # Initialize our Trainer - trainer = QuestionAnsweringTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - post_process_function=post_processing_function, - compute_metrics=compute_metrics, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - 
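# Editor's note: train_result.metrics already holds the aggregate training statistics (loss, runtime,
# throughput); "train_samples" is added below so the saved metrics also record how many raw training
# examples were actually used after any --max_train_samples truncation.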
metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Prediction - if training_args.do_predict: - logger.info("*** Predict ***") - results = trainer.predict(predict_dataset, predict_examples) - metrics = results.metrics - - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py deleted file mode 100644 index 50880a1f7c..0000000000 --- a/examples/question-answering/run_seq2seq_qa.py +++ /dev/null @@ -1,756 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library's seq2seq models for question answering using the 🤗 Seq2SeqTrainer. -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
- -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import List, Optional, Tuple - -import datasets -import evaluate -import numpy as np -import transformers -from datasets import load_dataset -from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer -from transformers import ( - AutoConfig, - AutoModelForSeq2SeqLM, - AutoTokenizer, - DataCollatorForSeq2Seq, - HfArgumentParser, -) -from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - context_column: Optional[str] = field( - default="context", - metadata={"help": "The name of the column in the datasets containing the contexts (for question answering)."}, - ) - question_column: Optional[str] = field( - default="question", - metadata={"help": "The name of the column in the datasets containing the questions (for question answering)."}, - ) - answer_column: Optional[str] = field( - default="answers", - metadata={"help": "The name of the column in the datasets containing the answers (for question answering)."}, - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: int = field( - default=384, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - max_answer_length: int = field( - default=30, - metadata={ - "help": ( - "The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another." - ) - }, - ) - val_max_answer_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_answer_length`. " - "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " - "during ``evaluate`` and ``predict``." - ) - }, - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. If False, will pad the samples dynamically when" - " batching to the maximum length in the batch (which can be faster on GPU but will be slower on HPU)." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." 
- ) - }, - ) - version_2_with_negative: bool = field( - default=False, metadata={"help": "If true, some of the examples do not have an answer."} - ) - null_score_diff_threshold: float = field( - default=0.0, - metadata={ - "help": ( - "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. " - "Only useful when `version_2_with_negative=True`." - ) - }, - ) - doc_stride: int = field( - default=128, - metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, - ) - n_best_size: int = field( - default=20, - metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, - ) - num_beams: Optional[int] = field( - default=None, - metadata={ - "help": ( - "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " - "which is used during ``evaluate`` and ``predict``." - ) - }, - ) - ignore_pad_token_for_loss: bool = field( - default=True, - metadata={ - "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation file/test_file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - if self.val_max_answer_length is None: - self.val_max_answer_length = self.max_answer_length - - -question_answering_column_name_mapping = { - "squad_v2": ("question", "context", "answer"), -} - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_seq2seq_qa", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - field="data", - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. 
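# Editor's illustration (an assumption about the local-file layout, not taken from the original script):
# a minimal SQuAD-style JSON accepted by the local-file branch above would look like
#
#   {"data": [{"id": "0", "question": "Who?", "context": "Ada Lovelace wrote notes.",
#              "answers": {"text": ["Ada Lovelace"], "answer_start": [0]}}]}
#
# and load_dataset("json", data_files={"train": "train.json"}, field="data") would expose the records
# under "data" as the train split, with the question, context and answers columns used below.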
- - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. - embedding_size = model.get_input_embeddings().weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - - if model.config.decoder_start_token_id is None: - raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") - - # Preprocessing the datasets. - # We need to generate and tokenize inputs and targets. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - column_names = raw_datasets["validation"].column_names - elif training_args.do_predict: - column_names = raw_datasets["test"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") - return - - # Get the column names for input/target. - dataset_columns = question_answering_column_name_mapping.get(data_args.dataset_name, None) - if data_args.question_column is None: - question_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - question_column = data_args.question_column - if question_column not in column_names: - raise ValueError( - f"--question_column' value '{data_args.question_column}' needs to be one of: {', '.join(column_names)}" - ) - if data_args.context_column is None: - context_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - context_column = data_args.context_column - if context_column not in column_names: - raise ValueError( - f"--context_column' value '{data_args.context_column}' needs to be one of: {', '.join(column_names)}" - ) - if data_args.answer_column is None: - answer_column = dataset_columns[2] if dataset_columns is not None else column_names[2] - else: - answer_column = data_args.answer_column - if answer_column not in column_names: - raise ValueError( - f"--answer_column' value '{data_args.answer_column}' needs to be one of: {', '.join(column_names)}" - ) - - # Temporarily set max_answer_length for training. 
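# Editor's note: `padding` (set just below) becomes "max_length" whenever --pad_to_max_length keeps its
# default of True; fixed-length batches give static tensor shapes, which avoids graph recompilation on
# HPU, as hinted by the argument's help text.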
- max_answer_length = data_args.max_answer_length - padding = "max_length" if data_args.pad_to_max_length else False - - if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): - logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " - f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" - ) - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_squad_batch( - examples, - question_column: str, - context_column: str, - answer_column: str, - ) -> Tuple[List[str], List[str]]: - questions = examples[question_column] - contexts = examples[context_column] - answers = examples[answer_column] - - def generate_input(_question, _context): - return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) - - inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] - targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] - return inputs, targets - - def preprocess_function(examples): - inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) - - model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) - # Tokenize targets with text_target=... - labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - # Validation preprocessing - def preprocess_validation_function(examples): - inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) - - model_inputs = tokenizer( - inputs, - max_length=max_seq_length, - padding=padding, - truncation=True, - return_overflowing_tokens=True, - return_offsets_mapping=True, - ) - # Tokenize targets with the `text_target` keyword argument - labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = model_inputs.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. 
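# Editor's note: a long context may overflow into several features; overflow_to_sample_mapping is used
# below to map every feature back to its originating example so that it inherits that example's id and
# target answer.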
- model_inputs["example_id"] = [] - # Augment the overflowing tokens to the labels - labels_out = [] - - for i in range(len(model_inputs["input_ids"])): - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - model_inputs["example_id"].append(examples["id"][sample_index]) - labels_out.append(labels["input_ids"][sample_index]) - - model_inputs["labels"] = labels_out - return model_inputs - - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - # We will select sample from whole data if argument is specified - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - # Create train feature from dataset - with training_args.main_process_first(desc="train dataset map pre-processing"): - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - if data_args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if training_args.do_eval: - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - # We will select sample from whole data - max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) - eval_examples = eval_examples.select(range(max_eval_samples)) - # Validation Feature Creation - with training_args.main_process_first(desc="validation dataset map pre-processing"): - eval_dataset = eval_examples.map( - preprocess_validation_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - if data_args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - if training_args.do_predict: - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_examples = raw_datasets["test"] - if data_args.max_predict_samples is not None: - # We will select sample from whole data - predict_examples = predict_examples.select(range(data_args.max_predict_samples)) - # Predict Feature Creation - with training_args.main_process_first(desc="prediction dataset map pre-processing"): - predict_dataset = predict_examples.map( - preprocess_validation_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) - if data_args.max_predict_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - max_predict_samples = min(len(predict_dataset), 
data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) - - # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id - data_collator = DataCollatorForSeq2Seq( - tokenizer, - model=model, - label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if training_args.fp16 else None, - ) - - metric = evaluate.load( - "squad_v2" if data_args.version_2_with_negative else "squad", cache_dir=model_args.cache_dir - ) - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - # Post-processing: - def post_processing_function( - examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval" - ): - # Decode the predicted tokens. - preds = outputs.predictions - if isinstance(preds, tuple): - preds = preds[0] - # Replace -100s used for padding as we can't decode them - preds = np.where(preds != -100, preds, tokenizer.pad_token_id) - decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} - predictions = {} - # Let's loop over all the examples! - for example_index, example in enumerate(examples): - # This is the index of the feature associated to the current example. - feature_index = feature_per_example[example_index] - predictions[example["id"]] = decoded_preds[feature_index] - - # Format the result to the format the metric expects. - if data_args.version_2_with_negative: - formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - # Initialize our Trainer - trainer = QuestionAnsweringSeq2SeqTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, - post_process_function=post_processing_function, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - results = {} - max_length = ( - training_args.generation_max_length - if training_args.generation_max_length is not None - else data_args.val_max_answer_length - ) - num_beams 
= data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Prediction - if training_args.do_predict: - logger.info("*** Predict ***") - results = trainer.predict(predict_dataset, predict_examples) - metrics = results.metrics - - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/question-answering/trainer_qa.py b/examples/question-answering/trainer_qa.py deleted file mode 100644 index 3f684502a2..0000000000 --- a/examples/question-answering/trainer_qa.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A subclass of `GaudiTrainer` specific to Question-Answering tasks -""" - -import math -import time - -from transformers.trainer_utils import PredictionOutput, speed_metrics - -from optimum.habana import GaudiTrainer - - -class QuestionAnsweringTrainer(GaudiTrainer): - def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): - super().__init__(*args, **kwargs) - self.eval_examples = eval_examples - self.post_process_function = post_process_function - - def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): - eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset - eval_dataloader = self.get_eval_dataloader(eval_dataset) - eval_examples = self.eval_examples if eval_examples is None else eval_examples - - # Temporarily disable metric computation, we will do it in the loop here. 
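# Editor's note: compute_metrics is unset so that the generic evaluation loop only gathers raw
# predictions; the SQuAD metrics are computed afterwards, once post_process_function has mapped the
# start/end logits back to answer strings in the original contexts.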
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - start_time = time.time() - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - finally: - self.compute_metrics = compute_metrics - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save: - # Only the main node write the results by default - eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) - metrics = self.compute_metrics(eval_preds) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - metrics.update(output.metrics) - else: - metrics = output.metrics - - if self.args.should_log: - # Only the main node log the results by default - self.log(metrics) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) - return metrics - - def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): - predict_dataloader = self.get_test_dataloader(predict_dataset) - - # Temporarily disable metric computation, we will do it in the loop here. 
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - start_time = time.time() - try: - output = eval_loop( - predict_dataloader, - description="Prediction", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - finally: - self.compute_metrics = compute_metrics - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - if self.post_process_function is None or self.compute_metrics is None: - return output - - if self.args.should_save: - predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") - metrics = self.compute_metrics(predictions) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - metrics.update(output.metrics) - return PredictionOutput( - predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics - ) diff --git a/examples/question-answering/trainer_seq2seq_qa.py b/examples/question-answering/trainer_seq2seq_qa.py deleted file mode 100644 index 7a8a31c420..0000000000 --- a/examples/question-answering/trainer_seq2seq_qa.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -A subclass of `GaudiSeq2SeqTrainer` specific to Question-Answering tasks -""" - -import math -import time -from typing import Dict, List, Optional - -from torch.utils.data import Dataset -from transformers.trainer_utils import PredictionOutput, speed_metrics - -from optimum.habana import GaudiSeq2SeqTrainer - - -class QuestionAnsweringSeq2SeqTrainer(GaudiSeq2SeqTrainer): - def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): - super().__init__(*args, **kwargs) - self.eval_examples = eval_examples - self.post_process_function = post_process_function - - # def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - eval_examples=None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - **gen_kwargs, - ) -> Dict[str, float]: - gen_kwargs = gen_kwargs.copy() - - # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the - # training args - if gen_kwargs.get("max_length") is None and self.args.generation_max_length is not None: - gen_kwargs["max_length"] = self.args.generation_max_length - if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: - gen_kwargs["num_beams"] = self.args.generation_num_beams - self._gen_kwargs = gen_kwargs - - eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset - eval_dataloader = self.get_eval_dataloader(eval_dataset) - eval_examples = self.eval_examples if eval_examples is None else eval_examples - - # Temporarily disable metric computation, we will do it in the loop here. - compute_metrics = self.compute_metrics - self.compute_metrics = None - start_time = time.time() - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - finally: - self.compute_metrics = compute_metrics - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save: - # Only the main node write the results by default - eval_preds = self.post_process_function(eval_examples, eval_dataset, output) - metrics = self.compute_metrics(eval_preds) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - metrics.update(output.metrics) - else: - metrics = output.metrics - - if self.args.should_log: - # Only the main node log the results by default - self.log(metrics) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) - return metrics - - def predict( - self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = 
"test", **gen_kwargs - ): - self._gen_kwargs = gen_kwargs.copy() - - predict_dataloader = self.get_test_dataloader(predict_dataset) - - # Temporarily disable metric computation, we will do it in the loop here. - compute_metrics = self.compute_metrics - self.compute_metrics = None - start_time = time.time() - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - predict_dataloader, - description="Prediction", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - finally: - self.compute_metrics = compute_metrics - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - if self.post_process_function is None or self.compute_metrics is None: - return output - - if self.args.should_save: - predictions = self.post_process_function(predict_examples, predict_dataset, output, "predict") - metrics = self.compute_metrics(predictions) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - metrics.update(output.metrics) - return PredictionOutput( - predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics - ) diff --git a/examples/question-answering/utils_qa.py b/examples/question-answering/utils_qa.py deleted file mode 100644 index c596bf98ef..0000000000 --- a/examples/question-answering/utils_qa.py +++ /dev/null @@ -1,441 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Post-processing utilities for question answering. -""" - -import collections -import json -import logging -import os -from typing import Optional, Tuple - -import numpy as np -from tqdm.auto import tqdm - - -logger = logging.getLogger(__name__) - - -def postprocess_qa_predictions( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - null_score_diff_threshold: float = 0.0, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - log_level: Optional[int] = logging.WARNING, -): - """ - Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the - original contexts. This is the base postprocessing functions for models that only return start and end logits. 
- Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): - The threshold used to select the null answer: if the best answer has a score that is less than the score of - the null answer minus this threshold, the null answer is selected for this example (note that the score of - the null answer for an example giving several features is the minimum of the scores for the null answer on - each feature: all features must be aligned on the fact they `want` to predict a null answer). - Only useful when :obj:`version_2_with_negative` is :obj:`True`. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): - ``logging`` log level (e.g., ``logging.WARNING``) - """ - if len(predictions) != 2: - raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") - all_start_logits, all_end_logits = predictions - - if len(predictions[0]) != len(features): - raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # The dictionaries we have to fill. - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - if version_2_with_negative: - scores_diff_json = collections.OrderedDict() - - # Logging. - logger.setLevel(log_level) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): - # Those are the indices of the features associated to the current example. - feature_indices = features_per_example[example_index] - - min_null_prediction = None - prelim_predictions = [] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - # We grab the predictions of the model for this feature. 
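# Editor's note: all_start_logits and all_end_logits have shape (num_features, max_seq_length);
# indexing them by feature_index yields the per-token logits for this particular span of the example.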
- start_logits = all_start_logits[feature_index] - end_logits = all_end_logits[feature_index] - # This is what will allow us to map some the positions in our logits to span of texts in the original - # context. - offset_mapping = features[feature_index]["offset_mapping"] - # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context - # available in the current feature. - token_is_max_context = features[feature_index].get("token_is_max_context", None) - - # Update minimum null prediction. - feature_null_score = start_logits[0] + end_logits[0] - if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: - min_null_prediction = { - "offsets": (0, 0), - "score": feature_null_score, - "start_logit": start_logits[0], - "end_logit": end_logits[0], - } - - # Go through all possibilities for the `n_best_size` greater start and end logits. - start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - for start_index in start_indexes: - for end_index in end_indexes: - # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond - # to part of the input_ids that are not in the context. - if ( - start_index >= len(offset_mapping) - or end_index >= len(offset_mapping) - or offset_mapping[start_index] is None - or len(offset_mapping[start_index]) < 2 - or offset_mapping[end_index] is None - or len(offset_mapping[end_index]) < 2 - ): - continue - # Don't consider answers with a length that is either < 0 or > max_answer_length. - if end_index < start_index or end_index - start_index + 1 > max_answer_length: - continue - # Don't consider answer that don't have the maximum context available (if such information is - # provided). - if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): - continue - - prelim_predictions.append( - { - "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), - "score": start_logits[start_index] + end_logits[end_index], - "start_logit": start_logits[start_index], - "end_logit": end_logits[end_index], - } - ) - if version_2_with_negative and min_null_prediction is not None: - # Add the minimum null prediction - prelim_predictions.append(min_null_prediction) - null_score = min_null_prediction["score"] - - # Only keep the best `n_best_size` predictions. - predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] - - # Add back the minimum null prediction if it was removed because of its low score. - if ( - version_2_with_negative - and min_null_prediction is not None - and not any(p["offsets"] == (0, 0) for p in predictions) - ): - predictions.append(min_null_prediction) - - # Use the offsets to gather the answer text in the original context. - context = example["context"] - for pred in predictions: - offsets = pred.pop("offsets") - pred["text"] = context[offsets[0] : offsets[1]] - - # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid - # failure. - if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): - predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) - - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). 
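# Editor's note: subtracting np.max(scores) before exponentiating leaves the softmax unchanged (the
# common factor exp(-max) cancels in the ratio) while preventing overflow for large logits; e.g.
# softmax([1000., 999.]) is computed as softmax([1., 0.]) ~= [0.731, 0.269].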
- scores = np.array([pred.pop("score") for pred in predictions]) - exp_scores = np.exp(scores - np.max(scores)) - probs = exp_scores / exp_scores.sum() - - # Include the probabilities in our predictions. - for prob, pred in zip(probs, predictions): - pred["probability"] = prob - - # Pick the best prediction. If the null answer is not possible, this is easy. - if not version_2_with_negative: - all_predictions[example["id"]] = predictions[0]["text"] - else: - # Otherwise we first need to find the best non-empty prediction. - i = 0 - while predictions[i]["text"] == "": - i += 1 - best_non_null_pred = predictions[i] - - # Then we compare to the null prediction using the threshold. - score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] - scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. - if score_diff > null_score_diff_threshold: - all_predictions[example["id"]] = "" - else: - all_predictions[example["id"]] = best_non_null_pred["text"] - - # Make `predictions` JSON-serializable by casting np.float back to float. - all_nbest_json[example["id"]] = [ - {k: float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v for k, v in pred.items()} - for pred in predictions - ] - - # If we have an output_dir, let's save all those dicts. - if output_dir is not None: - if not os.path.isdir(output_dir): - raise EnvironmentError(f"{output_dir} is not a directory.") - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - logger.info(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - logger.info(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions - - -def postprocess_qa_predictions_with_beam_search( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - start_n_top: int = 5, - end_n_top: int = 5, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - log_level: Optional[int] = logging.WARNING, -): - """ - Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the - original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as - cls token predictions. - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. 
- version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - start_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. - end_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): - ``logging`` log level (e.g., ``logging.WARNING``) - """ - if len(predictions) != 5: - raise ValueError("`predictions` should be a tuple with five elements.") - start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions - - if len(predictions[0]) != len(features): - raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # The dictionaries we have to fill. - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() if version_2_with_negative else None - - # Logging. - logger.setLevel(log_level) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): - # Those are the indices of the features associated to the current example. - feature_indices = features_per_example[example_index] - - min_null_score = None - prelim_predictions = [] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - # We grab the predictions of the model for this feature. - start_log_prob = start_top_log_probs[feature_index] - start_indexes = start_top_index[feature_index] - end_log_prob = end_top_log_probs[feature_index] - end_indexes = end_top_index[feature_index] - feature_null_score = cls_logits[feature_index] - # This is what will allow us to map some the positions in our logits to span of texts in the original - # context. - offset_mapping = features[feature_index]["offset_mapping"] - # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context - # available in the current feature. 
- token_is_max_context = features[feature_index].get("token_is_max_context", None) - - # Update minimum null prediction - if min_null_score is None or feature_null_score < min_null_score: - min_null_score = feature_null_score - - # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. - for i in range(start_n_top): - for j in range(end_n_top): - start_index = int(start_indexes[i]) - j_index = i * end_n_top + j - end_index = int(end_indexes[j_index]) - # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the - # p_mask but let's not take any risk) - if ( - start_index >= len(offset_mapping) - or end_index >= len(offset_mapping) - or offset_mapping[start_index] is None - or len(offset_mapping[start_index]) < 2 - or offset_mapping[end_index] is None - or len(offset_mapping[end_index]) < 2 - ): - continue - - # Don't consider answers with a length negative or > max_answer_length. - if end_index < start_index or end_index - start_index + 1 > max_answer_length: - continue - # Don't consider answer that don't have the maximum context available (if such information is - # provided). - if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): - continue - prelim_predictions.append( - { - "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), - "score": start_log_prob[i] + end_log_prob[j_index], - "start_log_prob": start_log_prob[i], - "end_log_prob": end_log_prob[j_index], - } - ) - - # Only keep the best `n_best_size` predictions. - predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] - - # Use the offsets to gather the answer text in the original context. - context = example["context"] - for pred in predictions: - offsets = pred.pop("offsets") - pred["text"] = context[offsets[0] : offsets[1]] - - # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid - # failure. - if len(predictions) == 0: - # Without predictions min_null_score is going to be None and None will cause an exception later - min_null_score = -2e-6 - predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": min_null_score}) - - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). - scores = np.array([pred.pop("score") for pred in predictions]) - exp_scores = np.exp(scores - np.max(scores)) - probs = exp_scores / exp_scores.sum() - - # Include the probabilities in our predictions. - for prob, pred in zip(probs, predictions): - pred["probability"] = prob - - # Pick the best prediction and set the probability for the null answer. - all_predictions[example["id"]] = predictions[0]["text"] - if version_2_with_negative: - scores_diff_json[example["id"]] = float(min_null_score) - - # Make `predictions` JSON-serializable by casting np.float back to float. - all_nbest_json[example["id"]] = [ - {k: float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v for k, v in pred.items()} - for pred in predictions - ] - - # If we have an output_dir, let's save all those dicts. 
- if output_dir is not None: - if not os.path.isdir(output_dir): - raise EnvironmentError(f"{output_dir} is not a directory.") - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - logger.info(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - logger.info(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions, scores_diff_json diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md deleted file mode 100644 index 92e576be3f..0000000000 --- a/examples/speech-recognition/README.md +++ /dev/null @@ -1,329 +0,0 @@ - - -# Automatic Speech Recognition Examples - -## Table of Contents - -- [Automatic Speech Recognition with CTC](#connectionist-temporal-classification) - - [Single HPU example](#single-hpu-ctc) - - [Multi HPU example](#multi-hpu-ctc) -- [Automatic Speech Recognition with Sequence-to-Sequence](#sequence-to-sequence) - - [Whisper Model](#whisper-model) - - [Fine tuning](#single-hpu-whisper-fine-tuning-with-seq2seq) - - [Inference](#single-hpu-seq2seq-inference) - - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Connectionist Temporal Classification - -The script [`run_speech_recognition_ctc.py`](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition/run_speech_recognition_ctc.py) can be used to fine-tune any pretrained [Connectionist Temporal Classification Model](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCTC) for automatic speech recognition on one of the [official speech recognition datasets](https://huggingface.co/datasets?task_ids=task_ids:automatic-speech-recognition) or a custom dataset. - -Speech recognition models that have been pretrained in an unsupervised fashion on audio data alone, *e.g.* [Wav2Vec2](https://huggingface.co/transformers/main/model_doc/wav2vec2.html), have shown to require only very little annotated data to yield good performance on automatic speech recognition datasets. - -In the script [`run_speech_recognition_ctc`](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition/run_speech_recognition_ctc.py), we first create a vocabulary from all unique characters of both the training data and evaluation data. Then, we preprocess the speech recognition dataset, which includes correct resampling, normalization and padding. Finally, the pretrained speech recognition model is fine-tuned on the annotated speech recognition datasets using CTC loss. - - - -### Single-HPU CTC - -The following command shows how to fine-tune [wav2vec2-large-lv60](https://huggingface.co/facebook/wav2vec2-large-lv60) on [Librispeech](https://huggingface.co/datasets/librispeech_asr) using a single HPU. 
- -```bash -python run_speech_recognition_ctc.py \ - --dataset_name="librispeech_asr" \ - --model_name_or_path="facebook/wav2vec2-large-lv60" \ - --dataset_config_name="clean" \ - --train_split_name="train.100" \ - --eval_split_name="validation" \ - --output_dir="/tmp/wav2vec2-librispeech-clean-100h-demo-dist" \ - --preprocessing_num_workers="64" \ - --dataloader_num_workers 8 \ - --overwrite_output_dir \ - --num_train_epochs="3" \ - --per_device_train_batch_size="4" \ - --learning_rate="3e-4" \ - --warmup_steps="500" \ - --text_column_name="text" \ - --layerdrop="0.0" \ - --freeze_feature_encoder \ - --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name="Habana/wav2vec2" \ - --throughput_warmup_steps="3" \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference -``` - -On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**. - -> If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error. -> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15. - -### Multi-HPU CTC - -The following command shows how to fine-tune [wav2vec2-large-lv60](https://huggingface.co/facebook/wav2vec2-large-lv60) on [Librispeech](https://huggingface.co/datasets/librispeech_asr) using 8 HPUs. - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_speech_recognition_ctc.py \ - --dataset_name librispeech_asr \ - --model_name_or_path facebook/wav2vec2-large-lv60 \ - --dataset_config_name clean \ - --train_split_name train.100 \ - --eval_split_name validation \ - --output_dir /tmp/wav2vec2-librispeech-clean-100h-demo-dist \ - --preprocessing_num_workers 64 \ - --dataloader_num_workers 8 \ - --overwrite_output_dir \ - --num_train_epochs 3 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-4 \ - --warmup_steps 500 \ - --text_column_name text \ - --layerdrop 0.0 \ - --freeze_feature_encoder \ - --chars_to_ignore '",?.!-;:\"“%‘”"' \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference -``` - -On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**. - -> If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error. -> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15. - - -## DeepSpeed - -> You need to install DeepSpeed with: -> ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -> ``` - -DeepSpeed can be used with almost the same command as for a multi-card run: -- `use_mpi` should be replaced by `use_deepspeed`, -- an additional `--deepspeed path_to_my_deepspeed config` argument should be provided, for instance `--deepspeed ../../tests/configs/deepspeed_zero_2.json`. 
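If you do not already have a ZeRO-2 configuration file at hand, a minimal one can be generated with a short snippet such as the sketch below. The values are illustrative assumptions only; the config actually shipped under `tests/configs` may contain different settings.

```python
# Illustrative sketch: write a minimal ZeRO-2 DeepSpeed config to disk.
# The settings below are assumptions, not a copy of the repository's config file.
import json

zero2_config = {
    "steps_per_print": 64,
    "train_batch_size": "auto",                # "auto" values are resolved by the trainer
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": True},                 # matches the bf16 flag used in the commands
    "zero_optimization": {"stage": 2},         # ZeRO stage-2 optimizer state partitioning
}

with open("deepspeed_zero_2.json", "w") as f:
    json.dump(zero2_config, f, indent=4)
```

The resulting file can then be passed via `--deepspeed deepspeed_zero_2.json`.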
- -For example: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_speech_recognition_ctc.py \ - --dataset_name librispeech_asr \ - --model_name_or_path facebook/wav2vec2-large-lv60 \ - --dataset_config_name clean \ - --train_split_name train.100 \ - --eval_split_name validation \ - --output_dir /tmp/wav2vec2-librispeech-clean-100h-demo-dist \ - --preprocessing_num_workers 64 \ - --dataloader_num_workers 8 \ - --overwrite_output_dir \ - --num_train_epochs 3 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-4 \ - --warmup_steps 500 \ - --text_column_name text \ - --layerdrop 0.0 \ - --freeze_feature_encoder \ - --chars_to_ignore '",?.!-;:\"“%‘”"' \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json -``` - -[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. - -> If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error. -> Resampling with the `datasets` library is not supported on HPUs yet. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with Wav2Vec2 on the Librispeech dataset on 1 Gaudi card with the following command: -```bash -python run_speech_recognition_ctc.py \ - --dataset_name="librispeech_asr" \ - --model_name_or_path="facebook/wav2vec2-large-lv60" \ - --dataset_config_name="clean" \ - --train_split_name="train.100" \ - --eval_split_name="validation" \ - --output_dir="/tmp/wav2vec2-librispeech-clean-100h-demo-dist" \ - --preprocessing_num_workers="64" \ - --dataloader_num_workers 8 \ - --overwrite_output_dir \ - --text_column_name="text" \ - --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name="Habana/wav2vec2" \ - --bf16 \ - --use_hpu_graphs_for_inference -``` -## Sequence to Sequence - -The script [`run_speech_recognition_seq2seq.py`](https://github.com/huggingface/optimum-habana/examples/speech-recognition/run_speech_recognition_seq2seq.py) can be used to fine-tune any [Whisper Sequence-to-Sequence Model](https://huggingface.co/docs/transformers/main/en/model_doc/whisper#whisper) for automatic speech -recognition on one of the well known speech recognition datasets similar to shown below or a custom dataset. Examples of two datasets using the Whisper model from OpenAI are included below. - -### Whisper Model -We can load all components of the Whisper model directly from the pretrained checkpoint, including the pretrained model weights, feature extractor and tokenizer. We simply have to specify our fine-tuning dataset and training hyperparameters. 
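For reference, the snippet below is a minimal sketch of how these components can be loaded with `transformers` outside of the example script; the checkpoint, language, and task are the same illustrative values used in the commands that follow.

```python
# Minimal sketch: load the Whisper components referenced above.
from transformers import AutoFeatureExtractor, AutoModelForSpeechSeq2Seq, AutoTokenizer

checkpoint = "openai/whisper-small"  # illustrative checkpoint, as in the example below

feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)

# For multilingual fine-tuning, configure the language and task prefix tokens
# (omit both for English-only speech recognition).
tokenizer.set_prefix_tokens(language="hindi", task="transcribe")
```

In the example script itself, the same components are loaded through the `Auto*` classes imported at the top of `run_speech_recognition_seq2seq.py`.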
- -### Single HPU Whisper Fine tuning with Seq2Seq -The following example shows how to fine-tune the [Whisper small](https://huggingface.co/openai/whisper-small) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using a single HPU device in bf16 precision: -```bash -python run_speech_recognition_seq2seq.py \ - --model_name_or_path="openai/whisper-small" \ - --dataset_name="mozilla-foundation/common_voice_11_0" \ - --dataset_config_name="hi" \ - --language="hindi" \ - --task="transcribe" \ - --train_split_name="train+validation" \ - --eval_split_name="test" \ - --gaudi_config_name="Habana/whisper" \ - --max_steps="5000" \ - --output_dir="/tmp/whisper-small-hi" \ - --per_device_train_batch_size="48" \ - --per_device_eval_batch_size="2" \ - --logging_steps="25" \ - --learning_rate="1e-5" \ - --warmup_steps="500" \ - --evaluation_strategy="steps" \ - --eval_steps="1000" \ - --save_strategy="steps" \ - --save_steps="1000" \ - --generation_max_length="225" \ - --preprocessing_num_workers="1" \ - --max_duration_in_seconds="30" \ - --text_column_name="sentence" \ - --freeze_feature_encoder="False" \ - --bf16 \ - --overwrite_output_dir \ - --do_train \ - --do_eval \ - --predict_with_generate \ - --use_habana \ - --use_hpu_graphs_for_inference \ - --label_features_max_length 128 \ - --dataloader_num_workers 8 \ - --throughput_warmup_steps 3 -``` - -If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition. - - -### Multi HPU Whisper Training with Seq2Seq -The following example shows how to fine-tune the [Whisper large](https://huggingface.co/openai/whisper-large) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using 8 HPU devices in half-precision: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_speech_recognition_seq2seq.py \ - --model_name_or_path="openai/whisper-large" \ - --dataset_name="mozilla-foundation/common_voice_11_0" \ - --dataset_config_name="hi" \ - --language="hindi" \ - --task="transcribe" \ - --train_split_name="train+validation" \ - --eval_split_name="test" \ - --gaudi_config_name="Habana/whisper" \ - --max_steps="625" \ - --output_dir="/tmp/whisper-large-hi" \ - --per_device_train_batch_size="16" \ - --per_device_eval_batch_size="2" \ - --logging_steps="25" \ - --learning_rate="1e-5" \ - --generation_max_length="225" \ - --preprocessing_num_workers="1" \ - --max_duration_in_seconds="30" \ - --text_column_name="sentence" \ - --freeze_feature_encoder="False" \ - --bf16 \ - --overwrite_output_dir \ - --do_train \ - --do_eval \ - --predict_with_generate \ - --use_habana \ - --use_hpu_graphs_for_inference \ - --label_features_max_length 128 \ - --dataloader_num_workers 8 \ - --gradient_checkpointing \ - --throughput_warmup_steps 3 -``` - -#### Single HPU Seq2Seq Inference - -The following example shows how to do inference with the [Whisper small](https://huggingface.co/openai/whisper-small) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using 1 HPU devices in half-precision: - -```bash -python run_speech_recognition_seq2seq.py \ - --model_name_or_path="openai/whisper-small" \ - --dataset_name="mozilla-foundation/common_voice_11_0" \ - --dataset_config_name="hi" \ - --language="hindi" \ - --task="transcribe" \ - 
--eval_split_name="test" \ - --gaudi_config_name="Habana/whisper" \ - --output_dir="./results/whisper-small-clean" \ - --per_device_eval_batch_size="32" \ - --generation_max_length="225" \ - --preprocessing_num_workers="1" \ - --max_duration_in_seconds="30" \ - --text_column_name="sentence" \ - --freeze_feature_encoder="False" \ - --bf16 \ - --overwrite_output_dir \ - --do_eval \ - --predict_with_generate \ - --use_habana \ - --use_hpu_graphs_for_inference \ - --label_features_max_length 128 \ - --dataloader_num_workers 8 -``` diff --git a/examples/speech-recognition/requirements.txt b/examples/speech-recognition/requirements.txt deleted file mode 100644 index 3d8644bd89..0000000000 --- a/examples/speech-recognition/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -datasets >= 1.18.0, <= 2.19.2 -librosa == 0.10.2.post1 -jiwer == 3.0.4 -evaluate == 0.4.2 diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py deleted file mode 100644 index 048da1dd5d..0000000000 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ /dev/null @@ -1,842 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition""" - -import functools -import json -import logging -import os -import re -import sys -import warnings -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Union - -import datasets -import evaluate -import torch -import transformers -from datasets import DatasetDict, load_dataset -from transformers import ( - AutoConfig, - AutoFeatureExtractor, - AutoModelForCTC, - AutoProcessor, - AutoTokenizer, - HfArgumentParser, - Wav2Vec2Processor, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") - - -def list_field(default=None, metadata=None): - return field(default_factory=lambda: default, metadata=metadata) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - tokenizer_name_or_path: Optional[str] = field( - default=None, - metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"}, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - freeze_feature_encoder: bool = field( - default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} - ) - attention_dropout: float = field( - default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."} - ) - activation_dropout: float = field( - default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."} - ) - feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."}) - hidden_dropout: float = field( - default=0.0, - metadata={ - "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler." - }, - ) - final_dropout: float = field( - default=0.0, - metadata={"help": "The dropout probability for the final projection layer."}, - ) - mask_time_prob: float = field( - default=0.05, - metadata={ - "help": ( - "Probability of each feature vector along the time axis to be chosen as the start of the vector " - "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature " - "vectors will be masked along the time axis." - ) - }, - ) - mask_time_length: int = field( - default=10, - metadata={"help": "Length of vector span to mask along the time axis."}, - ) - mask_feature_prob: float = field( - default=0.0, - metadata={ - "help": ( - "Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan" - " to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature" - " bins will be masked along the time axis." - ) - }, - ) - mask_feature_length: int = field( - default=10, - metadata={"help": "Length of vector span to mask along the feature axis."}, - ) - layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."}) - ctc_loss_reduction: Optional[str] = field( - default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} - ) - ctc_zero_infinity: Optional[bool] = field( - default=False, - metadata={ - "help": "Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly" - " occur when the inputs are too short to be aligned to the targets." - }, - ) - add_adapter: Optional[bool] = field( - default=False, - metadata={ - "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very " - "useful to downsample the output length." - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. 
- """ - - dataset_name: str = field( - metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: str = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_split_name: str = field( - default="train+validation", - metadata={ - "help": ( - "The name of the training data set split to use (via the datasets library). Defaults to " - "'train+validation'" - ) - }, - ) - eval_split_name: str = field( - default="test", - metadata={ - "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'" - }, - ) - audio_column_name: str = field( - default="audio", - metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, - ) - text_column_name: str = field( - default="text", - metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of validation examples to this " - "value if set." - ) - }, - ) - chars_to_ignore: Optional[List[str]] = list_field( - default=None, - metadata={"help": "A list of characters to remove from the transcripts."}, - ) - eval_metrics: List[str] = list_field( - default=["wer"], - metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"}, - ) - max_duration_in_seconds: float = field( - default=20.0, - metadata={ - "help": ( - "Filter audio files that are longer than `max_duration_in_seconds` seconds to" - " 'max_duration_in_seconds`" - ) - }, - ) - min_duration_in_seconds: float = field( - default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} - ) - preprocessing_only: bool = field( - default=False, - metadata={ - "help": ( - "Whether to only do data preprocessing and skip training. This is especially useful when data" - " preprocessing errors out in distributed training due to timeout. In this case, one should run the" - " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets" - " can consequently be loaded in distributed training" - ) - }, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." 
- ) - }, - ) - unk_token: str = field( - default="[UNK]", - metadata={"help": "The unk token for the tokenizer"}, - ) - pad_token: str = field( - default="[PAD]", - metadata={"help": "The padding token for the tokenizer"}, - ) - word_delimiter_token: str = field( - default="|", - metadata={"help": "The word delimiter token for the tokenizer"}, - ) - phoneme_language: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The target language that should be used be" - " passed to the tokenizer for tokenization. Note that" - " this is only relevant if the model classifies the" - " input audio to a sequence of phoneme sequences." - ) - }, - ) - - -@dataclass -class DataCollatorCTCWithPadding: - """ - Data collator that will dynamically pad the inputs received. - Args: - processor (:class:`~transformers.AutoProcessor`) - The processor used for proccessing the data. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding index) - among: - * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the - maximum acceptable input length for the model if that argument is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different lengths). - max_length (:obj:`int`, `optional`): - Maximum length of the ``input_values`` of the returned list and optionally padding length (see above). - max_length_labels (:obj:`int`, `optional`): - Maximum length of the ``labels`` returned list and optionally padding length (see above). - pad_to_multiple_of (:obj:`int`, `optional`): - If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). 
- """ - - processor: AutoProcessor - padding: Union[bool, str] = "longest" - pad_to_multiple_of: Optional[int] = None - pad_to_multiple_of_labels: Optional[int] = None - feature_extractor_input_name: Optional[str] = "input_values" - - def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: - # split inputs and labels since they have to be of different lengths and need - # different padding methods - input_features = [ - {self.feature_extractor_input_name: feature[self.feature_extractor_input_name]} for feature in features - ] - label_features = [{"input_ids": feature["labels"]} for feature in features] - - batch = self.processor.pad( - input_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors="pt", - ) - - labels_batch = self.processor.pad( - labels=label_features, - padding=self.padding, - pad_to_multiple_of=self.pad_to_multiple_of_labels, - return_tensors="pt", - ) - - # replace padding with -100 to ignore loss correctly - labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - - batch["labels"] = labels - if "attention_mask" in batch: - batch["attention_mask"] = batch["attention_mask"].to(torch.long) - - return batch - - -def create_vocabulary_from_data( - datasets: DatasetDict, - word_delimiter_token: Optional[str] = None, - unk_token: Optional[str] = None, - pad_token: Optional[str] = None, -): - # Given training and test labels create vocabulary - def extract_all_chars(batch): - all_text = " ".join(batch["target_text"]) - vocab = list(set(all_text)) - return {"vocab": [vocab], "all_text": [all_text]} - - vocabs = datasets.map( - extract_all_chars, - batched=True, - batch_size=-1, - keep_in_memory=True, - remove_columns=datasets["train"].column_names, - ) - - # take union of all unique characters in each dataset - vocab_set = functools.reduce( - lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values() - ) - - vocab_dict = {v: k for k, v in enumerate(sorted(vocab_set))} - - # replace white space with delimiter token - if word_delimiter_token is not None: - vocab_dict[word_delimiter_token] = vocab_dict[" "] - del vocab_dict[" "] - - # add unk and pad token - if unk_token is not None: - vocab_dict[unk_token] = len(vocab_dict) - - if pad_token is not None: - vocab_dict[pad_token] = len(vocab_dict) - - return vocab_dict - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) - - # Detecting last checkpoint. 
- last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - token=data_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - logger.info("Training/evaluation parameters %s", training_args) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # 1. First, let's load the dataset - raw_datasets = DatasetDict() - - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.train_split_name, - token=data_args.token, - ) - - if data_args.audio_column_name not in raw_datasets["train"].column_names: - raise ValueError( - f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." - " Make sure to set `--audio_column_name` to the correct audio column - one of" - f" {', '.join(raw_datasets['train'].column_names)}." - ) - - if data_args.text_column_name not in raw_datasets["train"].column_names: - raise ValueError( - f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--text_column_name` to the correct text column - one of " - f"{', '.join(raw_datasets['train'].column_names)}." - ) - - if data_args.max_train_samples is not None: - raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) - - if training_args.do_eval: - raw_datasets["eval"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.eval_split_name, - token=data_args.token, - ) - - if data_args.max_eval_samples is not None: - raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) - - # 2. We remove some special characters from the datasets - # that make training complicated and do not help in transcribing the speech - # E.g. 
characters, such as `,` and `.` do not really have an acoustic characteristic - # that could be easily picked up by the model - chars_to_ignore_regex = ( - f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None - ) - text_column_name = data_args.text_column_name - - def remove_special_characters(batch): - if chars_to_ignore_regex is not None: - batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " - else: - batch["target_text"] = batch[text_column_name].lower() + " " - return batch - - with training_args.main_process_first(desc="dataset map special characters removal"): - raw_datasets = raw_datasets.map( - remove_special_characters, - remove_columns=[text_column_name], - desc="remove special characters from datasets", - ) - - # save special tokens for tokenizer - word_delimiter_token = data_args.word_delimiter_token - unk_token = data_args.unk_token - pad_token = data_args.pad_token - - # 3. Next, let's load the config as we might need it to create - # the tokenizer - # load config - config = AutoConfig.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - token=data_args.token, - trust_remote_code=data_args.trust_remote_code, - ) - - # 4. Next, if no tokenizer file is defined, - # we create the vocabulary of the model by extracting all unique characters from - # the training and evaluation datasets - # We need to make sure that only first rank saves vocabulary - # make sure all processes wait until vocab is created - tokenizer_name_or_path = model_args.tokenizer_name_or_path - tokenizer_kwargs = {} - if tokenizer_name_or_path is None: - # save vocab in training output dir - tokenizer_name_or_path = training_args.output_dir - - vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") - - with training_args.main_process_first(): - if training_args.overwrite_output_dir and os.path.isfile(vocab_file): - try: - os.remove(vocab_file) - except OSError: - # in shared file-systems it might be the case that - # two processes try to delete the vocab file at the some time - pass - - with training_args.main_process_first(desc="dataset map vocabulary creation"): - if not os.path.isfile(vocab_file): - os.makedirs(tokenizer_name_or_path, exist_ok=True) - vocab_dict = create_vocabulary_from_data( - raw_datasets, - word_delimiter_token=word_delimiter_token, - unk_token=unk_token, - pad_token=pad_token, - ) - - # save vocab dict to be loaded into tokenizer - with open(vocab_file, "w") as file: - json.dump(vocab_dict, file) - - # if tokenizer has just been created - # it is defined by `tokenizer_class` if present in config else by `model_type` - tokenizer_kwargs = { - "config": config if config.tokenizer_class is not None else None, - "tokenizer_type": config.model_type if config.tokenizer_class is None else None, - "unk_token": unk_token, - "pad_token": pad_token, - "word_delimiter_token": word_delimiter_token, - } - - # 5. Now we can instantiate the feature extractor, tokenizer and model - # Note for distributed training, the .from_pretrained methods guarantee that only - # one local process can concurrently download model & vocab. 
- - # load feature_extractor and tokenizer - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - token=data_args.token, - trust_remote_code=data_args.trust_remote_code, - **tokenizer_kwargs, - ) - feature_extractor = AutoFeatureExtractor.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - token=data_args.token, - trust_remote_code=data_args.trust_remote_code, - ) - - # adapt config - config.update( - { - "feat_proj_dropout": model_args.feat_proj_dropout, - "attention_dropout": model_args.attention_dropout, - "hidden_dropout": model_args.hidden_dropout, - "final_dropout": model_args.final_dropout, - "mask_time_prob": model_args.mask_time_prob, - "mask_time_length": model_args.mask_time_length, - "mask_feature_prob": model_args.mask_feature_prob, - "mask_feature_length": model_args.mask_feature_length, - "gradient_checkpointing": training_args.gradient_checkpointing, - "layerdrop": model_args.layerdrop, - "ctc_loss_reduction": model_args.ctc_loss_reduction, - "ctc_zero_infinity": model_args.ctc_zero_infinity, - "pad_token_id": tokenizer.pad_token_id, - "vocab_size": len(tokenizer), - "activation_dropout": model_args.activation_dropout, - "add_adapter": model_args.add_adapter, - } - ) - - # create model - model = AutoModelForCTC.from_pretrained( - model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - config=config, - token=data_args.token, - trust_remote_code=data_args.trust_remote_code, - ) - - # freeze encoder - if model_args.freeze_feature_encoder: - model.freeze_feature_encoder() - - # 6. Now we preprocess the datasets including loading the audio, resampling and normalization - # Thankfully, `datasets` takes care of automatically loading and resampling the audio, - # so that we just need to set the correct target sampling rate and normalize the input - # via the `feature_extractor` - - # make sure that dataset decodes audio with correct sampling rate - dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate - if dataset_sampling_rate != feature_extractor.sampling_rate: - raise RuntimeError( - f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" - f" ({feature_extractor.sampling_rate}).Data resampling should be done. The Datasets library does not" - " support it on HPUs yet." - ) - raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) - ) - - # derive max & min input length for sample rate & max duration - max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate - min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate - audio_column_name = data_args.audio_column_name - num_workers = data_args.preprocessing_num_workers - feature_extractor_input_name = feature_extractor.model_input_names[0] - - # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification - phoneme_language = data_args.phoneme_language - - # Preprocessing the datasets. - # We need to read the audio files as arrays and tokenize the targets. 
- def prepare_dataset(batch): - # load audio - sample = batch[audio_column_name] - - inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) - batch[feature_extractor_input_name] = getattr(inputs, feature_extractor_input_name)[0] - # take length of raw audio waveform - batch["input_length"] = len(sample["array"].squeeze()) - - # encode targets - additional_kwargs = {} - if phoneme_language is not None: - additional_kwargs["phonemizer_lang"] = phoneme_language - - batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids - return batch - - with training_args.main_process_first(desc="dataset map preprocessing"): - vectorized_datasets = raw_datasets.map( - prepare_dataset, - remove_columns=next(iter(raw_datasets.values())).column_names, - num_proc=num_workers, - desc="preprocess datasets", - ) - - def is_audio_in_length_range(length): - return length > min_input_length and length < max_input_length - - # filter data that is shorter than min_input_length - vectorized_datasets = vectorized_datasets.filter( - is_audio_in_length_range, - num_proc=num_workers, - input_columns=["input_length"], - ) - - # 7. Next, we can prepare the training. - # Let's use word error rate (WER) as our evaluation metric, - # instantiate a data collator and the trainer - - # Define evaluation metrics during training, *i.e.* word error rate, character error rate - eval_metrics = {metric: evaluate.load(metric, cache_dir=model_args.cache_dir) for metric in data_args.eval_metrics} - - # for large datasets it is advised to run the preprocessing on a - # single machine first with ``args.preprocessing_only`` since there will mostly likely - # be a timeout when running the script in distributed mode. - # In a second step ``args.preprocessing_only`` can then be set to `False` to load the - # cached dataset - if data_args.preprocessing_only: - logger.info(f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}") - return - - # For languages like Chinese with large vocabulary size, we need to discard logits - # and only keep the argmax, otherwise we run out of memory during evaluation. - def preprocess_logits_for_metrics(logits, labels): - pred_ids = torch.argmax(logits, dim=-1) - return pred_ids, labels - - def compute_metrics(pred): - pred_ids = pred.predictions[0] - pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id - - pred_str = tokenizer.batch_decode(pred_ids) - # we do not want to group tokens when computing the metrics - label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) - - metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()} - - return metrics - - # Now save everything to be able to create a single processor later - # make sure all processes wait until data is saved - with training_args.main_process_first(): - # only the main process saves them - if is_main_process(training_args.local_rank): - # save feature extractor, tokenizer and config - feature_extractor.save_pretrained(training_args.output_dir) - tokenizer.save_pretrained(training_args.output_dir) - config.save_pretrained(training_args.output_dir) - - try: - processor = AutoProcessor.from_pretrained(training_args.output_dir) - except (OSError, KeyError): - warnings.warn( - "Loading a processor from a feature extractor config that does not" - " include a `processor_class` attribute is deprecated and will be removed in v5. 
Please add the following " - " attribute to your `preprocessor_config.json` file to suppress this warning: " - " `'processor_class': 'Wav2Vec2Processor'`", - FutureWarning, - ) - processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) - - # Instantiate custom data collator - data_collator = DataCollatorCTCWithPadding( - processor=processor, - feature_extractor_input_name=feature_extractor_input_name, - pad_to_multiple_of=int(max_input_length), - pad_to_multiple_of_labels=500, - ) - - # Initialize Trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - data_collator=data_collator, - args=training_args, - compute_metrics=compute_metrics, - train_dataset=vectorized_datasets["train"] if training_args.do_train else None, - eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, - tokenizer=processor, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - # 8. Finally, we can start training - - # Training - if training_args.do_train: - # use last checkpoint if exist - if last_checkpoint is not None: - checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None - - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples - if data_args.max_train_samples is not None - else len(vectorized_datasets["train"]) - ) - metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - results = {} - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - max_eval_samples = ( - data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) - ) - metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Write model card and (optionally) push to hub - config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" - kwargs = { - "finetuned_from": model_args.model_name_or_path, - "tasks": "automatic-speech-recognition", - "tags": ["automatic-speech-recognition", data_args.dataset_name], - "dataset_args": ( - f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" - f" {data_args.eval_split_name}" - ), - "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", - } - if "common_voice" in data_args.dataset_name: - kwargs["language"] = config_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - return results - - -if __name__ == "__main__": - main() diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py deleted file mode 100755 index 06733f8e7c..0000000000 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ /dev/null @@ -1,662 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for sequence to sequence speech recognition. -""" -# You can also adapt this script on your own sequence to sequence speech -# recognition task. Pointers for this are left as comments. - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Union - -import datasets -import evaluate -import torch -import transformers -from datasets import DatasetDict, load_dataset -from transformers import ( - AutoConfig, - AutoFeatureExtractor, - AutoModelForSpeechSeq2Seq, - AutoProcessor, - AutoTokenizer, - HfArgumentParser, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - feature_extractor_name: Optional[str] = field( - default=None, metadata={"help": "feature extractor name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. 
This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - freeze_feature_encoder: bool = field( - default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} - ) - freeze_encoder: bool = field( - default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."} - ) - forced_decoder_ids: List[List[int]] = field( - default=None, - metadata={"help": "Deprecated. Please use the `language` and `task` arguments instead."}, - ) - suppress_tokens: List[int] = field( - default=None, - metadata={ - "help": ( - "Deprecated. The use of `suppress_tokens` should not be required for the majority of fine-tuning examples." - "Should you need to use `suppress_tokens`, please manually update them in the fine-tuning script directly." - ) - }, - ) - apply_spec_augment: bool = field( - default=False, - metadata={ - "help": "Whether to apply *SpecAugment* data augmentation to the input features. This is currently only relevant for Wav2Vec2, HuBERT, WavLM and Whisper models." - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: str = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - audio_column_name: str = field( - default="audio", - metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, - ) - text_column_name: str = field( - default="text", - metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, - ) - max_duration_in_seconds: float = field( - default=20.0, - metadata={ - "help": ( - "Truncate audio files that are longer than `max_duration_in_seconds` seconds to" - " 'max_duration_in_seconds`" - ) - }, - ) - min_duration_in_seconds: float = field( - default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} - ) - preprocessing_only: bool = field( - default=False, - metadata={ - "help": ( - "Whether to only do data preprocessing and skip training. This is especially useful when data" - " preprocessing errors out in distributed training due to timeout. 
In this case, one should run the" - " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets" - " can consequently be loaded in distributed training" - ) - }, - ) - train_split_name: str = field( - default="train", - metadata={ - "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" - }, - ) - eval_split_name: str = field( - default="test", - metadata={ - "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" - }, - ) - do_lower_case: bool = field( - default=True, - metadata={"help": "Whether the target text should be lower cased."}, - ) - language: str = field( - default=None, - metadata={ - "help": ( - "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning " - "only. For English speech recognition, it should be set to `None`." - ) - }, - ) - task: str = field( - default="transcribe", - metadata={"help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."}, - ) - label_features_max_length: int = field( - default=None, - metadata={"help": "Max length for padding label features."}, - ) - - -@dataclass -class DataCollatorSpeechSeq2SeqWithPadding: - """ - Data collator that will dynamically pad the inputs received. - Args: - processor ([`WhisperProcessor`]) - The processor used for processing the data. - decoder_start_token_id (`int`) - The begin-of-sentence of the decoder. - forward_attention_mask (`bool`) - Whether to return attention_mask. - """ - - processor: Any - decoder_start_token_id: int - forward_attention_mask: bool - label_features_max_length: int - - def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: - # split inputs and labels since they have to be of different lengths and need - # different padding methods - model_input_name = self.processor.model_input_names[0] - input_features = [{model_input_name: feature[model_input_name]} for feature in features] - label_features = [{"input_ids": feature["labels"]} for feature in features] - - batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt") - - if self.forward_attention_mask: - batch["attention_mask"] = torch.LongTensor([feature["attention_mask"] for feature in features]) - - kwargs = {} - if self.label_features_max_length is not None: - kwargs["padding"] = "max_length" - kwargs["max_length"] = self.label_features_max_length - labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt", **kwargs) - - # replace padding with -100 to ignore loss correctly - labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) - - # if bos token is appended in previous tokenization step, - # cut bos token here as it's append later anyways - if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item(): - labels = labels[:, 1:] - - batch["labels"] = labels - - return batch - - -def main(): - # 1. Parse input arguments - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args) - - # 2. Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - logger.info("Training/evaluation parameters %s", training_args) - - # 3. Detecting last checkpoint and eventually continue from last checkpoint - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # 4. Load dataset - raw_datasets = DatasetDict() - - if training_args.do_train: - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.train_split_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - if training_args.do_eval: - raw_datasets["eval"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=data_args.eval_split_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - - if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: - raise ValueError( - f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. 
" - "Make sure to set `--audio_column_name` to the correct audio column - one of " - f"{', '.join(next(iter(raw_datasets.values())).column_names)}." - ) - - if data_args.text_column_name not in next(iter(raw_datasets.values())).column_names: - raise ValueError( - f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " - "Make sure to set `--text_column_name` to the correct text column - one of " - f"{', '.join(next(iter(raw_datasets.values())).column_names)}." - ) - - # 5. Load pretrained model, tokenizer, and feature extractor - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # SpecAugment for whisper models - if getattr(config, "model_type", None) == "whisper": - config.update({"apply_spec_augment": model_args.apply_spec_augment}) - - feature_extractor = AutoFeatureExtractor.from_pretrained( - model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_args.model_name_or_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - if model.config.decoder_start_token_id is None: - raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") - - if model_args.freeze_feature_encoder: - model.freeze_feature_encoder() - - if model_args.freeze_encoder: - model.freeze_encoder() - model.model.encoder.gradient_checkpointing = False - - if hasattr(model.generation_config, "is_multilingual") and model.generation_config.is_multilingual: - # We only need to set the language and task ids in a multilingual setting - tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task) - model.generation_config.language = data_args.language - model.generation_config.task = data_args.task - elif data_args.language is not None: - raise ValueError( - "Setting language token for an English-only checkpoint is not permitted. The language argument should " - "only be set for multilingual checkpoints." - ) - - # TODO (Sanchit): deprecate these arguments in v4.41 - if model_args.forced_decoder_ids is not None: - logger.warning( - "The use of `forced_decoder_ids` is deprecated and will be removed in v4.41." - "Please use the `language` and `task` arguments instead" - ) - else: - model.generation_config.forced_decoder_ids = None - model.config.forced_decoder_ids = None - - if model_args.suppress_tokens is not None: - logger.warning( - "The use of `suppress_tokens` is deprecated and will be removed in v4.41." 
- "Should you need `suppress_tokens`, please manually set them in the fine-tuning script." - ) - model.generation_config.suppress_tokens = model_args.suppress_tokens - - # 6. Resample speech dataset if necessary - dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate - if dataset_sampling_rate != feature_extractor.sampling_rate: - logger.warning( - f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" - f" ({feature_extractor.sampling_rate}).Data resampling should be done." - ) - raw_datasets = raw_datasets.cast_column( - data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) - ) - - # 7. Preprocessing the datasets. - # We need to read the audio files as arrays and tokenize the targets. - max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate - min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate - audio_column_name = data_args.audio_column_name - num_workers = data_args.preprocessing_num_workers - text_column_name = data_args.text_column_name - model_input_name = feature_extractor.model_input_names[0] - do_lower_case = data_args.do_lower_case - # if SpecAugment is used for whisper models, return attention_mask to guide the mask along time axis - forward_attention_mask = ( - getattr(config, "model_type", None) == "whisper" - and getattr(config, "apply_spec_augment", False) - and getattr(config, "mask_time_prob", 0) > 0 - ) - - if data_args.max_train_samples is not None: - raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) - - if data_args.max_eval_samples is not None: - raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) - - def prepare_dataset(batch): - # process audio - sample = batch[audio_column_name] - inputs = feature_extractor( - sample["array"], sampling_rate=sample["sampling_rate"], return_attention_mask=forward_attention_mask - ) - # process audio length - batch[model_input_name] = inputs.get(model_input_name)[0] - batch["input_length"] = len(sample["array"]) - if forward_attention_mask: - batch["attention_mask"] = inputs.get("attention_mask")[0] - - # process targets - input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name] - batch["labels"] = tokenizer(input_str).input_ids - return batch - - with training_args.main_process_first(desc="dataset map pre-processing"): - vectorized_datasets = raw_datasets.map( - prepare_dataset, - remove_columns=next(iter(raw_datasets.values())).column_names, - num_proc=data_args.preprocessing_num_workers, - desc="preprocess train dataset", - ) - - # filter data that is shorter than min_input_length or longer than - # max_input_length - def is_audio_in_length_range(length): - return length > min_input_length and length < max_input_length - - vectorized_datasets = vectorized_datasets.filter( - is_audio_in_length_range, - num_proc=num_workers, - input_columns=["input_length"], - ) - - # for large datasets it is advised to run the preprocessing on a - # single machine first with `args.preprocessing_only` since there will mostly likely - # be a timeout when running the script in distributed mode. 
- # In a second step `args.preprocessing_only` can then be set to `False` to load the - # cached dataset - if data_args.preprocessing_only: - cache = {k: v.cache_files for k, v in vectorized_datasets.items()} - logger.info(f"Data preprocessing finished. Files cached at {cache}.") - return - - # 8. Load Metric - metric = evaluate.load("wer", cache_dir=model_args.cache_dir) - - def compute_metrics(pred): - pred_ids = pred.predictions - - pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id - - pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) - # we do not want to group tokens when computing the metrics - label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) - - wer = metric.compute(predictions=pred_str, references=label_str) - - return {"wer": wer} - - # 9. Create a single speech processor - # make sure all processes wait until data is saved - with training_args.main_process_first(): - # only the main process saves them - if is_main_process(training_args.local_rank): - # save feature extractor, tokenizer and config - feature_extractor.save_pretrained(training_args.output_dir) - tokenizer.save_pretrained(training_args.output_dir) - config.save_pretrained(training_args.output_dir) - - processor = AutoProcessor.from_pretrained(training_args.output_dir) - - # 10. Define data collator - data_collator = DataCollatorSpeechSeq2SeqWithPadding( - processor=processor, - decoder_start_token_id=model.config.decoder_start_token_id, - forward_attention_mask=forward_attention_mask, - label_features_max_length=data_args.label_features_max_length, - ) - - # 11. Initialize Trainer - trainer = GaudiSeq2SeqTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=vectorized_datasets["train"] if training_args.do_train else None, - eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, - tokenizer=feature_extractor, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, - ) - - # 12. Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the feature extractor too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples - if data_args.max_train_samples is not None - else len(vectorized_datasets["train"]) - ) - metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # 13. Evaluation - results = {} - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate( - metric_key_prefix="eval", - max_length=training_args.generation_max_length, - num_beams=training_args.generation_num_beams, - ) - max_eval_samples = ( - data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"]) - ) - metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # 14. 
Write Training Stats - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "automatic-speech-recognition"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - return results - - -if __name__ == "__main__": - main() diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md deleted file mode 100644 index 5f33c6fb7e..0000000000 --- a/examples/stable-diffusion/README.md +++ /dev/null @@ -1,325 +0,0 @@ - - -# Stable Diffusion Examples - -This directory contains a script that showcases how to perform text-to-image generation using Stable Diffusion on Intel® Gaudi® AI Accelerators. - -Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team. - - -## Text-to-image Generation - -### Single Prompt - -Here is how to generate images with one prompt: -```python -python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --prompts "An image of a squirrel in Picasso style" \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --image_save_dir /tmp/stable_diffusion_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - - -### Multiple Prompts - -Here is how to generate images with several prompts: -```python -python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \ - --num_images_per_prompt 20 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - - -### Stable Diffusion 2 - -[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used to generate images with this script. 
Here is an example for a single prompt: - -```python -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-2-1 \ - --prompts "An image of a squirrel in Picasso style" \ - --num_images_per_prompt 10 \ - --batch_size 2 \ - --height 768 \ - --width 768 \ - --image_save_dir /tmp/stable_diffusion_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion-2 -``` - -> There are two different checkpoints for Stable Diffusion 2: -> - use [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) for generating 768x768 images -> - use [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) for generating 512x512 images - - -### Latent Diffusion Model for 3D (LDM3D) - -[LDM3D](https://arxiv.org/abs/2305.10853) generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. - -[Original checkpoint](https://huggingface.co/Intel/ldm3d) and [latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) are open source. -A [demo](https://huggingface.co/spaces/Intel/ldm3d) is also available. Here is how to run this model: - -```python -python text_to_image_generation.py \ - --model_name_or_path "Intel/ldm3d-4c" \ - --prompts "An image of a squirrel in Picasso style" \ - --num_images_per_prompt 10 \ - --batch_size 2 \ - --height 768 \ - --width 768 \ - --image_save_dir /tmp/stable_diffusion_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion-2 \ - --ldm3d -``` - -> There are three different checkpoints for LDM3D: -> - use [original checkpoint](https://huggingface.co/Intel/ldm3d) to generate outputs from the paper -> - use [the latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) for generating improved results -> - use [the pano checkpoint](https://huggingface.co/Intel/ldm3d-pano) to generate panoramic view - -### Stable Diffusion XL (SDXL) - -Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/pdf/2307.01952.pdf) by the Stability AI team. - -Here is how to generate SDXL images with a single prompt: -```python -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - -Here is how to generate SDXL images with several prompts: -```python -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ - --num_images_per_prompt 20 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -SDXL combines a second text encoder (OpenCLIP ViT-bigG/14) with the original text encoder to significantly -increase the number of parameters. 
Here is how to generate images with several prompts for both `prompt` -and `prompt_2` (2nd text encoder), as well as their negative prompts: -```python -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ - --prompts_2 "Red tone" "Blue tone" \ - --negative_prompts "Low quality" "Sketch" \ - --negative_prompts_2 "Clouds" "Clouds" \ - --num_images_per_prompt 20 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - -### SDXL-Turbo -SDXL-Turbo is a distilled version of SDXL 1.0, trained for real-time synthesis. - -Here is how to generate images with multiple prompts: -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/sdxl-turbo \ - --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ - --num_images_per_prompt 20 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_xl_turbo_images \ - --scheduler euler_ancestral_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 \ - --num_inference_steps 1 \ - --guidance_scale 0.0 \ - --timestep_spacing trailing -``` - -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - -> Please note: there is a regression with "--guidance_scale 0.0" for the latest release. - - -### ControlNet - -ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models ](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. -It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. 
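When `--control_preprocessing_type canny` is used (the default), the generation script derives the conditioning image with a Canny edge filter before handing it to the ControlNet. The snippet below is a minimal sketch of that preprocessing step, assuming `opencv-python` (from `requirements.txt`) is installed; the Vermeer URL matches the `--control_image` used in the commands that follow, and the output filename is only illustrative.

```python
import cv2
import numpy as np
from diffusers.utils import load_image
from PIL import Image

# Load the image that will condition the generation (same URL as --control_image below).
image = np.array(
    load_image(
        "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
    )
)

# Extract Canny edges and stack them into a 3-channel image, which is the
# conditioning input the ControlNet expects (this mirrors what the script does).
edges = cv2.Canny(image, 100, 200)[:, :, None]
control_image = Image.fromarray(np.concatenate([edges, edges, edges], axis=2))
control_image.save("canny_conditioning.png")  # illustrative output path
```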
- -Here is how to generate images conditioned by canny edge model: -```bash -pip install -r requirements.txt -python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ - --prompts "futuristic-looking woman" \ - --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -Here is how to generate images conditioned by canny edge model and with multiple prompts: -```bash -pip install -r requirements.txt -python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ - --prompts "futuristic-looking woman" "a rusty robot" \ - --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ - --num_images_per_prompt 10 \ - --batch_size 4 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -Here is how to generate images conditioned by open pose model: -```bash -pip install -r requirements.txt -python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-openpose \ - --prompts "Chef in the kitchen" \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png \ - --control_preprocessing_type "none" \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -Here is how to generate images with conditioned by canny edge model using Stable Diffusion 2 -```bash -pip install -r requirements.txt -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-2-1 \ - --controlnet_model_name_or_path thibaud/controlnet-sd21-canny-diffusers \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png \ - --control_preprocessing_type "none" \ - --prompts "bird" \ - --seed 0 \ - --num_images_per_prompt 10 \ - --batch_size 2 \ - --image_save_dir /tmp/controlnet-2-1_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion-2 -``` - -# Stable Video Diffusion Examples - -Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcement](https://stability.ai/news/stable-video-diffusion-open-ai-video-model) -by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image. - -## Image-to-video Generation - -Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi. 
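The sections below drive the example script from the command line. If you prefer to call the pipeline directly from Python, the sketch below follows the same steps the script takes (Gaudi scheduler, HPU-enabled pipeline, `export_to_video`); the rocket image URL comes from the commands below, while the output filename and step count are only illustrative.

```python
import torch
from diffusers.utils import export_to_video, load_image

from optimum.habana.diffusers import GaudiEulerDiscreteScheduler, GaudiStableVideoDiffusionPipeline

model_name = "stabilityai/stable-video-diffusion-img2vid-xt"

# Scheduler and pipeline configured for HPU, as in image_to_video_generation.py.
scheduler = GaudiEulerDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler")
pipeline = GaudiStableVideoDiffusionPipeline.from_pretrained(
    model_name,
    scheduler=scheduler,
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
    torch_dtype=torch.bfloat16,
)

# A single conditioning image; the pipeline returns one frame sequence per input image.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
)
outputs = pipeline(image=[image], num_videos_per_prompt=1, num_inference_steps=25)
export_to_video(outputs.frames[0], "gen_video_00.mp4", fps=7)
```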
- -### Single Image Prompt - -Here is how to generate video with one image prompt: -```bash -python image_to_video_generation.py \ - --model_name_or_path "stabilityai/stable-video-diffusion-img2vid-xt" \ - --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ - --num_videos_per_prompt 1 \ - --video_save_dir /tmp/stable_video_diffusion_xt \ - --save_frames_as_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -### Multiple Image Prompts - -Here is how to generate videos with several image prompts: -```bash -python image_to_video_generation.py \ - --model_name_or_path "stabilityai/stable-video-diffusion-img2vid-xt" \ - --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" \ - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" \ - --num_videos_per_prompt 1 \ - --video_save_dir /tmp/stable_video_diffusion_xt \ - --save_frames_as_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py deleted file mode 100755 index 7beb73a1ac..0000000000 --- a/examples/stable-diffusion/image_to_video_generation.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import logging -import sys -from pathlib import Path - -import torch -from diffusers.utils import export_to_video, load_image - -from optimum.habana.diffusers import GaudiEulerDiscreteScheduler -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.8.1") - - -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default="stabilityai/stable-video-diffusion-img2vid-xt", - type=str, - help="Path to pre-trained model", - ) - - # Pipeline arguments - parser.add_argument( - "--image_path", - type=str, - default=None, - nargs="*", - help="Path to input image(s) to guide video generation", - ) - parser.add_argument( - "--num_videos_per_prompt", type=int, default=1, help="The number of videos to generate per prompt image." 
- ) - parser.add_argument("--batch_size", type=int, default=1, help="The number of videos in a batch.") - parser.add_argument("--height", type=int, default=576, help="The height in pixels of the generated video.") - parser.add_argument("--width", type=int, default=1024, help="The width in pixels of the generated video.") - parser.add_argument( - "--num_inference_steps", - type=int, - default=25, - help=( - "The number of denoising steps. More denoising steps usually lead to a higher quality images at the expense" - " of slower inference." - ), - ) - parser.add_argument( - "--min_guidance_scale", - type=float, - default=1.0, - help="The minimum guidance scale. Used for the classifier free guidance with first frame.", - ) - parser.add_argument( - "--max_guidance_scale", - type=float, - default=3.0, - help="The maximum guidance scale. Used for the classifier free guidance with last frame.", - ) - parser.add_argument( - "--fps", - type=int, - default=7, - help=( - "Frames per second. The rate at which the generated images shall be exported to a video after generation." - " Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training." - ), - ) - parser.add_argument( - "--motion_bucket_id", - type=int, - default=127, - help=( - "The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion" - " will be in the video." - ), - ) - parser.add_argument( - "--noise_aug_strength", - type=float, - default=0.02, - help=( - "The amount of noise added to the init image, the higher it is the less the video will look like the" - " init image. Increase it for more motion." - ), - ) - parser.add_argument( - "--decode_chunk_size", - type=int, - default=None, - help=( - "The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency" - " between frames, but also the higher the memory consumption. By default, the decoder will decode all" - " frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage." - ), - ) - parser.add_argument( - "--output_type", - type=str, - choices=["pil", "np"], - default="pil", - help="Whether to return PIL images or Numpy arrays.", - ) - parser.add_argument( - "--pipeline_save_dir", - type=str, - default=None, - help="The directory where the generation pipeline will be saved.", - ) - parser.add_argument( - "--video_save_dir", - type=str, - default="./stable-video-diffusion-generated-frames", - help="The directory where frames will be saved.", - ) - parser.add_argument( - "--save_frames_as_images", - action="store_true", - help="Save output frames as images", - ) - - parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.") - - # HPU-specific arguments - parser.add_argument("--use_habana", action="store_true", help="Use HPU.") - parser.add_argument( - "--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU. This should lead to faster generations." - ) - parser.add_argument( - "--gaudi_config_name", - type=str, - default="Habana/stable-diffusion", - help=( - "Name or path of the Gaudi configuration. In particular, it enables to specify how to apply Habana Mixed" - " Precision." 
- ), - ) - parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") - - args = parser.parse_args() - - from optimum.habana.diffusers import GaudiStableVideoDiffusionPipeline - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO) - - # Initialize the scheduler and the generation pipeline - scheduler = GaudiEulerDiscreteScheduler.from_pretrained(args.model_name_or_path, subfolder="scheduler") - kwargs = { - "scheduler": scheduler, - "use_habana": args.use_habana, - "use_hpu_graphs": args.use_hpu_graphs, - "gaudi_config": args.gaudi_config_name, - } - if args.bf16: - kwargs["torch_dtype"] = torch.bfloat16 - - pipeline = GaudiStableVideoDiffusionPipeline.from_pretrained( - args.model_name_or_path, - **kwargs, - ) - - # Set seed before running the model - set_seed(args.seed) - - # Load input image(s) - input = [] - logger.info("Input image(s):") - if isinstance(args.image_path, str): - args.image_path = [args.image_path] - for image_path in args.image_path: - image = load_image(image_path) - image = image.resize((args.height, args.width)) - input.append(image) - logger.info(image_path) - - # Generate images - outputs = pipeline( - image=input, - num_videos_per_prompt=args.num_videos_per_prompt, - batch_size=args.batch_size, - height=args.height, - width=args.width, - num_inference_steps=args.num_inference_steps, - min_guidance_scale=args.min_guidance_scale, - max_guidance_scale=args.max_guidance_scale, - fps=args.fps, - motion_bucket_id=args.motion_bucket_id, - noise_aug_strength=args.noise_aug_strength, - decode_chunk_size=args.decode_chunk_size, - output_type=args.output_type, - ) - - # Save the pipeline in the specified directory if not None - if args.pipeline_save_dir is not None: - pipeline.save_pretrained(args.pipeline_save_dir) - - # Save images in the specified directory if not None and if they are in PIL format - if args.video_save_dir is not None: - if args.output_type == "pil": - video_save_dir = Path(args.video_save_dir) - video_save_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Saving video frames in {video_save_dir.resolve()}...") - for i, frames in enumerate(outputs.frames): - export_to_video(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=7) - if args.save_frames_as_images: - for j, frame in enumerate(frames): - frame.save( - args.video_save_dir - + "/gen_video_" - + str(i).zfill(2) - + "_frame_" - + str(j).zfill(2) - + ".png" - ) - else: - logger.warning("--output_type should be equal to 'pil' to save frames in --video_save_dir.") - - -if __name__ == "__main__": - main() diff --git a/examples/stable-diffusion/requirements.txt b/examples/stable-diffusion/requirements.txt deleted file mode 100644 index 437dc81f19..0000000000 --- a/examples/stable-diffusion/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opencv-python == 4.10.0.82 diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py deleted file mode 100755 index e52fc5fcba..0000000000 --- a/examples/stable-diffusion/text_to_image_generation.py +++ /dev/null @@ -1,434 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import logging -import sys -from pathlib import Path - -import numpy as np -import torch - -from optimum.habana.diffusers import ( - GaudiDDIMScheduler, - GaudiEulerAncestralDiscreteScheduler, - GaudiEulerDiscreteScheduler, -) -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.11.0") - - -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default="runwayml/stable-diffusion-v1-5", - type=str, - help="Path to pre-trained model", - ) - - parser.add_argument( - "--controlnet_model_name_or_path", - default="lllyasviel/sd-controlnet-canny", - type=str, - help="Path to pre-trained model", - ) - - parser.add_argument( - "--scheduler", - default="ddim", - choices=["euler_discrete", "euler_ancestral_discrete", "ddim"], - type=str, - help="Name of scheduler", - ) - - parser.add_argument( - "--timestep_spacing", - default="linspace", - choices=["linspace", "leading", "trailing"], - type=str, - help="The way the timesteps should be scaled.", - ) - # Pipeline arguments - parser.add_argument( - "--prompts", - type=str, - nargs="*", - default="An image of a squirrel in Picasso style", - help="The prompt or prompts to guide the image generation.", - ) - parser.add_argument( - "--prompts_2", - type=str, - nargs="*", - default=None, - help="The second prompt or prompts to guide the image generation (applicable to SDXL).", - ) - parser.add_argument( - "--control_image", - type=str, - default=None, - help=("Path to the controlnet conditioning image"), - ) - parser.add_argument( - "--control_preprocessing_type", - type=str, - default="canny", - help=( - "The type of preprocessing to apply on contol image. Only `canny` is supported." - " Defaults to `canny`. Set to unsupported value to disable preprocessing." - ), - ) - parser.add_argument( - "--num_images_per_prompt", type=int, default=1, help="The number of images to generate per prompt." - ) - parser.add_argument("--batch_size", type=int, default=1, help="The number of images in a batch.") - parser.add_argument( - "--height", - type=int, - default=0, - help="The height in pixels of the generated images (0=default from model config).", - ) - parser.add_argument( - "--width", - type=int, - default=0, - help="The width in pixels of the generated images (0=default from model config).", - ) - parser.add_argument( - "--num_inference_steps", - type=int, - default=50, - help=( - "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense" - " of slower inference." 
- ), - ) - parser.add_argument( - "--guidance_scale", - type=float, - default=7.5, - help=( - "Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598)." - " Higher guidance scale encourages to generate images that are closely linked to the text `prompt`," - " usually at the expense of lower image quality." - ), - ) - parser.add_argument( - "--negative_prompts", - type=str, - nargs="*", - default=None, - help="The prompt or prompts not to guide the image generation.", - ) - parser.add_argument( - "--negative_prompts_2", - type=str, - nargs="*", - default=None, - help="The second prompt or prompts not to guide the image generation (applicable to SDXL).", - ) - parser.add_argument( - "--eta", - type=float, - default=0.0, - help="Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502.", - ) - parser.add_argument( - "--output_type", - type=str, - choices=["pil", "np"], - default="pil", - help="Whether to return PIL images or Numpy arrays.", - ) - - parser.add_argument( - "--pipeline_save_dir", - type=str, - default=None, - help="The directory where the generation pipeline will be saved.", - ) - parser.add_argument( - "--image_save_dir", - type=str, - default="./stable-diffusion-generated-images", - help="The directory where images will be saved.", - ) - - parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.") - - # HPU-specific arguments - parser.add_argument("--use_habana", action="store_true", help="Use HPU.") - parser.add_argument( - "--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU. This should lead to faster generations." - ) - parser.add_argument( - "--gaudi_config_name", - type=str, - default="Habana/stable-diffusion", - help=( - "Name or path of the Gaudi configuration. In particular, it enables to specify how to apply Habana Mixed" - " Precision." - ), - ) - parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") - parser.add_argument( - "--ldm3d", action="store_true", help="Use LDM3D to generate an image and a depth map from a given text prompt." 
- ) - parser.add_argument( - "--throughput_warmup_steps", - type=int, - default=None, - help="Number of steps to ignore for throughput calculation.", - ) - parser.add_argument( - "--profiling_warmup_steps", - type=int, - default=0, - help="Number of steps to ignore for profiling.", - ) - parser.add_argument( - "--profiling_steps", - type=int, - default=0, - help="Number of steps to capture for profiling.", - ) - parser.add_argument( - "--unet_adapter_name_or_path", - default=None, - type=str, - help="Path to pre-trained model", - ) - parser.add_argument( - "--text_encoder_adapter_name_or_path", - default=None, - type=str, - help="Path to pre-trained model", - ) - parser.add_argument( - "--lora_id", - default=None, - type=str, - help="Path to lora id", - ) - args = parser.parse_args() - - # Set image resolution - kwargs_call = {} - if args.width > 0 and args.height > 0: - kwargs_call["width"] = args.width - kwargs_call["height"] = args.height - - # ControlNet - if args.control_image is not None: - from diffusers.utils import load_image - from PIL import Image - - # get control image - control_image = load_image(args.control_image) - if args.control_preprocessing_type == "canny": - import cv2 - - image = np.array(control_image) - # get canny image - image = cv2.Canny(image, 100, 200) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - control_image = Image.fromarray(image) - - # Import selected pipeline - sdxl_models = ["stable-diffusion-xl", "sdxl"] - - if args.control_image is not None: - from diffusers import ControlNetModel - - from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline - - sdxl = False - elif any(model in args.model_name_or_path for model in sdxl_models): - from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline - - sdxl = True - else: - if args.ldm3d: - from optimum.habana.diffusers import GaudiStableDiffusionLDM3DPipeline as GaudiStableDiffusionPipeline - - if args.model_name_or_path == "runwayml/stable-diffusion-v1-5": - args.model_name_or_path = "Intel/ldm3d-4c" - else: - from optimum.habana.diffusers import GaudiStableDiffusionPipeline - sdxl = False - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO) - - # Initialize the scheduler and the generation pipeline - kwargs = {"timestep_spacing": args.timestep_spacing} - if args.scheduler == "euler_discrete": - scheduler = GaudiEulerDiscreteScheduler.from_pretrained( - args.model_name_or_path, subfolder="scheduler", **kwargs - ) - elif args.scheduler == "euler_ancestral_discrete": - scheduler = GaudiEulerAncestralDiscreteScheduler.from_pretrained( - args.model_name_or_path, subfolder="scheduler", **kwargs - ) - else: - scheduler = GaudiDDIMScheduler.from_pretrained(args.model_name_or_path, subfolder="scheduler", **kwargs) - - kwargs = { - "scheduler": scheduler, - "use_habana": args.use_habana, - "use_hpu_graphs": args.use_hpu_graphs, - "gaudi_config": args.gaudi_config_name, - } - - if args.bf16: - kwargs["torch_dtype"] = torch.bfloat16 - - if args.throughput_warmup_steps is not None: - kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps - - # Generate images - if args.control_image is not None: - model_dtype = torch.bfloat16 if args.bf16 else None - controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path, torch_dtype=model_dtype) - pipeline = 
GaudiStableDiffusionControlNetPipeline.from_pretrained( - args.model_name_or_path, - controlnet=controlnet, - **kwargs, - ) - if args.lora_id: - pipeline.load_lora_weights(args.lora_id) - - # Set seed before running the model - set_seed(args.seed) - - outputs = pipeline( - prompt=args.prompts, - image=control_image, - num_images_per_prompt=args.num_images_per_prompt, - batch_size=args.batch_size, - num_inference_steps=args.num_inference_steps, - guidance_scale=args.guidance_scale, - negative_prompt=args.negative_prompts, - eta=args.eta, - output_type=args.output_type, - profiling_warmup_steps=args.profiling_warmup_steps, - profiling_steps=args.profiling_steps, - **kwargs_call, - ) - elif sdxl: - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.model_name_or_path, - **kwargs, - ) - if args.lora_id: - pipeline.load_lora_weights(args.lora_id) - - # Set seed before running the model - set_seed(args.seed) - - outputs = pipeline( - prompt=args.prompts, - prompt_2=args.prompts_2, - num_images_per_prompt=args.num_images_per_prompt, - batch_size=args.batch_size, - num_inference_steps=args.num_inference_steps, - guidance_scale=args.guidance_scale, - negative_prompt=args.negative_prompts, - negative_prompt_2=args.negative_prompts_2, - eta=args.eta, - output_type=args.output_type, - profiling_warmup_steps=args.profiling_warmup_steps, - profiling_steps=args.profiling_steps, - **kwargs_call, - ) - else: - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.model_name_or_path, - **kwargs, - ) - if args.unet_adapter_name_or_path is not None: - from peft import PeftModel - - pipeline.unet = PeftModel.from_pretrained(pipeline.unet, args.unet_adapter_name_or_path) - pipeline.unet = pipeline.unet.merge_and_unload() - if args.text_encoder_adapter_name_or_path is not None: - from peft import PeftModel - - pipeline.text_encoder = PeftModel.from_pretrained( - pipeline.text_encoder, args.text_encoder_adapter_name_or_path - ) - pipeline.text_encoder = pipeline.text_encoder.merge_and_unload() - set_seed(args.seed) - - outputs = pipeline( - prompt=args.prompts, - num_images_per_prompt=args.num_images_per_prompt, - batch_size=args.batch_size, - num_inference_steps=args.num_inference_steps, - guidance_scale=args.guidance_scale, - negative_prompt=args.negative_prompts, - eta=args.eta, - output_type=args.output_type, - profiling_warmup_steps=args.profiling_warmup_steps, - profiling_steps=args.profiling_steps, - **kwargs_call, - ) - - # Save the pipeline in the specified directory if not None - if args.pipeline_save_dir is not None: - pipeline.save_pretrained(args.pipeline_save_dir) - - # Save images in the specified directory if not None and if they are in PIL format - if args.image_save_dir is not None: - if args.output_type == "pil": - image_save_dir = Path(args.image_save_dir) - image_save_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Saving images in {image_save_dir.resolve()}...") - if args.ldm3d: - for i, rgb in enumerate(outputs.rgb): - rgb.save(image_save_dir / f"rgb_{i+1}.png") - for i, depth in enumerate(outputs.depth): - depth.save(image_save_dir / f"depth_{i+1}.png") - else: - for i, image in enumerate(outputs.images): - image.save(image_save_dir / f"image_{i+1}.png") - else: - logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") - - -if __name__ == "__main__": - main() diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md deleted file mode 100644 index 38a52a62db..0000000000 --- 
a/examples/stable-diffusion/training/README.md +++ /dev/null @@ -1,429 +0,0 @@ - - -# Stable Diffusion Training Examples - -This directory contains scripts that showcase how to perform training/fine-tuning of Stable Diffusion models on Habana Gaudi. - - -## Textual Inversion - -[Textual Inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like Stable Diffusion on your own images using just 3-5 examples. -The `textual_inversion.py` script shows how to implement the training procedure on Habana Gaudi. - - -### Cat toy example - -Let's get our dataset. For this example, we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example . - -Let's first download it locally: - -```py -from huggingface_hub import snapshot_download - -local_dir = "./cat" -snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes") -``` - -This will be our training data. -Now we can launch the training using: - -```bash -python textual_inversion.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --train_data_dir ./cat \ - --learnable_property object \ - --placeholder_token "" \ - --initializer_token toy \ - --resolution 512 \ - --train_batch_size 4 \ - --max_train_steps 3000 \ - --learning_rate 5.0e-04 \ - --scale_lr \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir /tmp/textual_inversion_cat \ - --save_as_full_pipeline \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 -``` - -> Change `--resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. - -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. - - -## ControlNet Training - -ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models ](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. -This example is adapted from [controlnet example in the diffusers repository](https://github.com/huggingface/diffusers/tree/main/examples/controlnet#training). 
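The training commands below use the `fusing/fill50k` dataset. Before launching a run, it can help to confirm the data layout the script expects: each example pairs a target image with a conditioning image and a caption. A small sketch, assuming the dataset follows the column names of the upstream diffusers ControlNet example (`image`, `conditioning_image`, `text`):

```python
from datasets import load_dataset

# Inspect the dataset used in the training commands below.
# Depending on your `datasets` version, loading may require trust_remote_code=True.
dataset = load_dataset("fusing/fill50k", split="train")

print(dataset.column_names)  # expected: image, conditioning_image, text
example = dataset[0]
print(example["text"])  # caption for this image pair
example["conditioning_image"].save("conditioning_preview.png")  # illustrative output path
```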
- -First, download the conditioning images as shown below: - -```bash -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png -``` - -Then proceed to training with the following command: - -```bash -python train_controlnet.py \ - --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5\ - --output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps=3 \ - --use_hpu_graphs \ - --bf16 -``` - -### Multi-card Run - -You can run these fine-tuning scripts in a distributed fashion as follows: -```bash -python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs \ - --bf16 -``` - - -### Inference - -Once you have trained a ControlNet model as described above, inference can be done with the `GaudiStableDiffusionControlNetPipeline`: - -```python -from diffusers import ControlNetModel, UniPCMultistepScheduler -from diffusers.utils import load_image -import torch -from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline - -base_model_path = "runwayml/stable-diffusion-v1-5" -controlnet_path = "/tmp/stable_diffusion1_5" - -controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.bfloat16) -pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( - base_model_path, - controlnet=controlnet, - torch_dtype=torch.bfloat16, - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", -) - -# speed up diffusion process with faster scheduler and memory optimization -pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - -control_image = load_image("./conditioning_image_1.png") -prompt = "pale golden rod circle with old lace background" - -# generate image -generator = torch.manual_seed(0) -image = pipe( - prompt, num_inference_steps=20, generator=generator, image=control_image -).images[0] -image.save("./output.png") -``` - - -## Fine-Tuning for Stable Diffusion XL - -The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) models on Habana Gaudi.
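After training with one of the commands below, the fine-tuned checkpoint can be reloaded with `GaudiStableDiffusionXLPipeline` for inference. The following is a minimal sketch, assuming the script saved a full pipeline to the `sdxl-pokemon-model` output directory used in the commands below:

```python
from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline

# Load the fine-tuned SDXL pipeline from the training output directory.
pipeline = GaudiStableDiffusionXLPipeline.from_pretrained(
    "sdxl-pokemon-model",
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
)

images = pipeline(
    prompt="a robotic cat with wings",
    num_inference_steps=30,
    num_images_per_prompt=2,
).images
for i, image in enumerate(images):
    image.save(f"sdxl_sample_{i}.png")
```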
- -### Requirements - -Install the requirements: -```bash -pip install -r requirements.txt -``` - -### Single-card Training - -```bash -python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path stabilityai/sdxl-vae \ - --dataset_name lambdalabs/pokemon-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 2500 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl-pokemon-model \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a robotic cat with wings" \ - --validation_epochs 48 \ - --checkpointing_steps 2500 \ - --logging_step 10 \ - --adjust_throughput -``` - - -### Multi-card Training -```bash -PT_HPU_RECIPE_CACHE_CONFIG=/tmp/stdxl_recipe_cache,True,1024 \ -python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path stabilityai/sdxl-vae \ - --dataset_name lambdalabs/pokemon-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 336 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl-pokemon-model \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a robotic cat with wings" \ - --validation_epochs 48 \ - --checkpointing_steps 336 \ - --mediapipe dataset_sdxl_pokemon \ - --adjust_throughput -``` - -### Single-card Training on Gaudi1 -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=5 python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path stabilityai/sdxl-vae \ - --dataset_name lambdalabs/pokemon-blip-captions \ - --resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --max_train_steps 3000 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl-pokemon-model \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --bf16 -``` - -> [!NOTE] -> There is a known issue that in the first 2 steps, graph compilation takes longer than 10 seconds. This will be fixed in a future release. - -> [!NOTE] -> `--mediapipe` only works on Gaudi2. - - -## DreamBooth -DreamBooth is a method to personalize text-to-image models like Stable Diffusion given just a few (3~5) images of a subject. The `train_dreambooth.py` script shows how to implement the training procedure and adapt it for Stable Diffusion. - -### Dog toy example - -Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example. 
- -Let's first download it locally: - -```python -from huggingface_hub import snapshot_download - -local_dir = "./dog" -snapshot_download( - "diffusers/dog-example", - local_dir=local_dir, repo_type="dataset", - ignore_patterns=".gitattributes", -) -``` - -### Full model finetune -And launch the multi-card training using: -```bash - -export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export INSTANCE_DIR="dog" -export CLASS_DIR="path-to-class-images" -export OUTPUT_DIR="out" - -python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --class_data_dir=$CLASS_DIR \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - full - -``` -Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data. -According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time. - -### PEFT model finetune -We provide example for dreambooth to use lora/lokr/loha/oft to finetune unet or text encoder. - -**___Note: When using peft method we can use a much higher learning rate compared to vanilla dreambooth. Here we -use *1e-4* instead of the usual *5e-6*.___** - -Launch the multi-card training using: -```bash - -export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export INSTANCE_DIR="dog" -export CLASS_DIR="path-to-class-images" -export OUTPUT_DIR="out" - -python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --class_data_dir=$CLASS_DIR \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - lora --unet_r 8 --unet_alpha 8 - -``` -Similar command could be applied to loha, lokr, oft. -You could check each adapter specific args by "--help", like you could use following command to check oft specific args. - -```bash -python3 train_dreambooth.py oft --help - -``` - -**___Note: oft could not work with hpu graphs mode. since "torch.inverse" need to fallback to cpu. 
-there's error like "cpu fallback is not supported during hpu graph capturing"___** - - -You could use text_to_image_generation.py to generate picture using the peft adapter like - -```bash -python ../text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ - --prompts "a sks dog" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/stable_diffusion_images \ - --use_habana \ - --use_hpu_graphs \ - --unet_adapter_name_or_path out/unet \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - -### DreamBooth training example for Stable Diffusion XL -You could use the dog images as example as well. -You can launch training using: -```bash -export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" -export INSTANCE_DIR="dog" -export OUTPUT_DIR="lora-trained-xl" -export VAE_PATH="stabilityai/sdxl-vae" - -python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth_lora_sdxl.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --pretrained_vae_model_name_or_path=$VAE_PATH \ - --output_dir=$OUTPUT_DIR \ - --mixed_precision="bf16" \ - --instance_prompt="a photo of sks dog" \ - --resolution=1024 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=500 \ - --validation_prompt="A photo of sks dog in a bucket" \ - --validation_epochs=25 \ - --seed=0 \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name Habana/stable-diffusion - -``` - -You could use text_to_image_generation.py to generate picture using the peft adapter like - -```bash -python ../text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "A picture of a sks dog in a bucket" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --use_habana \ - --use_hpu_graphs \ - --lora_id lora-trained-xl \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` \ No newline at end of file diff --git a/examples/stable-diffusion/training/media_pipe_imgdir.py b/examples/stable-diffusion/training/media_pipe_imgdir.py deleted file mode 100644 index cf6536f338..0000000000 --- a/examples/stable-diffusion/training/media_pipe_imgdir.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
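# Overview (inferred from the code below): this module provides a Habana MediaPipe-based
# data-loading path for the SDXL fine-tuning example. ReadImageTextFromDataset feeds image
# file paths and precomputed prompt embeddings into the media pipe, SDXLMediaPipe decodes,
# resizes, randomly flips and normalizes the images on HPU, and MediaApiDataLoader wraps the
# pipe in a PyTorch-style iterator that yields pixel_values, prompt_embeds,
# pooled_prompt_embeds, original_sizes and crop_top_lefts.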
- - -import os - -import numpy as np -import torch -from datasets import Dataset as DatasetHF -from torch.distributed import get_rank, get_world_size -from torch.utils.data.sampler import BatchSampler -from transformers.trainer_pt_utils import DistributedSamplerWithLoop - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -try: - from habana_frameworks.mediapipe import fn - from habana_frameworks.mediapipe.media_types import ( - dtype, - ftype, - imgtype, - readerOutType, - ) - from habana_frameworks.mediapipe.mediapipe import MediaPipe - from habana_frameworks.mediapipe.operators.cpu_nodes.cpu_nodes import media_function - from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import ( - media_ext_reader_op_impl, - media_ext_reader_op_tensor_info, - ) -except ImportError: - pass - - -def get_dataset_for_pipeline(img_dir): - labels = open(f"{img_dir}/label.txt").readlines() - dct = {"image": [], "text": []} - for item in sorted( - [i for i in os.listdir(img_dir) if "txt" not in i], - key=lambda x: int(x.split(".")[0]), - ): - key = int(item.split(".")[0]) - dct["image"] += [f"{img_dir}/{item}"] - dct["text"] += [labels[key]] - - def gen(): - for idx in range(len(dct["image"])): - yield {"image": dct["image"][idx], "text": dct["text"][idx]} - - return DatasetHF.from_generator(gen) - - -class ReadImageTextFromDataset(media_ext_reader_op_impl): - """ - Class defining read image/text from directory node. - """ - - def __init__(self, params, fw_params): - priv_params = params["priv_params"] - self.dataset = priv_params["dataset"] - - self.dataset_image = [] - self.dataset_prompt_embeds = [] - self.dataset_pooled_prompt_embeds = [] - self.dataset_original_sizes = [] - self.dataset_crop_top_lefts = [] - for k in self.dataset: - self.dataset_image += [k["image"]] - self.dataset_prompt_embeds += [k["prompt_embeds"]] - self.dataset_pooled_prompt_embeds += [k["pooled_prompt_embeds"]] - self.dataset_original_sizes += [k["original_sizes"]] - self.dataset_crop_top_lefts += [k["crop_top_lefts"]] - - self.dataset_image = np.array(self.dataset_image) - self.dataset_prompt_embeds = np.array(self.dataset_prompt_embeds, dtype=np.float32) - self.dataset_pooled_prompt_embeds = np.array(self.dataset_pooled_prompt_embeds, dtype=np.float32) - self.dataset_original_sizes = np.array(self.dataset_original_sizes, dtype=np.uint32) - self.dataset_crop_top_lefts = np.array(self.dataset_crop_top_lefts, dtype=np.uint32) - self.epoch = 0 - self.batch_sampler = priv_params["batch_sampler"] - - self.num_imgs_slice = len(self.batch_sampler.sampler) - self.num_batches_slice = len(self.batch_sampler) - - logger.info("Finding largest file ...") - self.max_file = max(self.dataset["image"], key=lambda x: len(x)) - self.batch_size = fw_params.batch_size - - def gen_output_info(self): - out_info = [] - o = media_ext_reader_op_tensor_info(dtype.NDT, np.array([self.batch_size], dtype=np.uint32), "") - out_info.append(o) - sample = self.dataset[0] - sample["pooled_prompt_embeds"] - d0 = len(sample["pooled_prompt_embeds"]) - d1 = len(sample["prompt_embeds"]) - d2 = len(sample["prompt_embeds"][0]) - o = media_ext_reader_op_tensor_info(dtype.FLOAT32, np.array([d2, d1, self.batch_size], dtype=np.uint32), "") - out_info.append(o) - o = media_ext_reader_op_tensor_info(dtype.FLOAT32, np.array([d0, self.batch_size], dtype=np.uint32), "") - out_info.append(o) - o = media_ext_reader_op_tensor_info("uint32", np.array([2, self.batch_size], dtype=np.uint32), "") - out_info.append(o) - o = 
media_ext_reader_op_tensor_info("uint32", np.array([2, self.batch_size], dtype=np.uint32), "") - out_info.append(o) - return out_info - - def get_largest_file(self): - return self.max_file - - def get_media_output_type(self): - return readerOutType.FILE_LIST - - def __len__(self): - return self.num_batches_slice - - def __iter__(self): - self.iter_loc = 0 - self.epoch += 1 - self.batch_sampler.sampler.set_epoch( - self.epoch - ) # Without this dist sampler will create same batches every epoch - self.batch_sampler_iter = iter(self.batch_sampler) - return self - - def __next__(self): - if self.iter_loc > (self.num_imgs_slice - 1): - raise StopIteration - - data_idx = next(self.batch_sampler_iter) - img_list = list(self.dataset_image[data_idx]) - prompt_embeds_np = self.dataset_prompt_embeds[data_idx] - pooled_prompt_embeds_np = self.dataset_pooled_prompt_embeds[data_idx] - original_sizes = self.dataset_original_sizes[data_idx] - crop_top_lefts = self.dataset_crop_top_lefts[data_idx] - - self.iter_loc = self.iter_loc + self.batch_size - return ( - img_list, - prompt_embeds_np, - pooled_prompt_embeds_np, - original_sizes, - crop_top_lefts, - ) - - -class RandomFlipFunction(media_function): - """ - Class to randomly generate input for RandomFlip media node. - - """ - - def __init__(self, params): - """ - :params params: random_flip_func specific params. - shape: output shape - dtype: output data type - seed: seed to be used - """ - self.np_shape = params["shape"][::-1] - self.np_dtype = params["dtype"] - self.seed = params["seed"] - self.rng = np.random.default_rng(self.seed) - - def __call__(self): - """ - :returns : randomly generated binary output per image. - """ - probabilities = [1.0 - 0.5, 0.5] - random_flips = self.rng.choice([0, 1], p=probabilities, size=self.np_shape) - random_flips = np.array(random_flips, dtype=self.np_dtype) - return random_flips - - -class SDXLMediaPipe(MediaPipe): - """ - Class defining SDXL media pipe: - read data --> image decoding (include crop and resize) --> crop mirror normalize - - Original set of PyTorch transformations: - aspect ratio preserving resize -> center crop -> normalize - """ - - instance_count = 0 - - def __init__( - self, - dataset=None, - image_size=512, - sampler=None, - batch_size=512, - drop_last=True, - queue_depth=5, - ): - self.device = "legacy" - self.dataset = dataset - self.batch_size = batch_size - - self.drop_last = drop_last - self.sampler = sampler - self.batch_sampler = BatchSampler(self.sampler, batch_size, drop_last) - - self.image_size = image_size - - pipe_name = "{}:{}".format(self.__class__.__name__, SDXLMediaPipe.instance_count) - pipe_name = str(pipe_name) - - super(SDXLMediaPipe, self).__init__( - device=self.device, - batch_size=batch_size, - prefetch_depth=queue_depth, - pipe_name=pipe_name, - ) - - priv_params = {} - priv_params["dataset"] = self.dataset - priv_params["batch_sampler"] = self.batch_sampler - - self.input = fn.MediaExtReaderOp(impl=ReadImageTextFromDataset, num_outputs=5, priv_params=priv_params) - - def_output_image_size = [self.image_size, self.image_size] - res_pp_filter = ftype.BI_LINEAR - self.decode = fn.ImageDecoder( - device="hpu", - output_format=imgtype.RGB_P, - # random_crop_type=randomCropType.CENTER_CROP, - resize=def_output_image_size, - resampling_mode=res_pp_filter, - ) - normalize_mean = np.array([255 / 2, 255 / 2, 255 / 2]).astype(np.float32) - normalize_std = 1 / (np.array([255 / 2, 255 / 2, 255 / 2]).astype(np.float32)) - norm_mean = fn.MediaConst(data=normalize_mean, shape=[1, 1, 
3], dtype=dtype.FLOAT32) - norm_std = fn.MediaConst(data=normalize_std, shape=[1, 1, 3], dtype=dtype.FLOAT32) - self.cmn = fn.CropMirrorNorm( - crop_w=self.image_size, - crop_h=self.image_size, - crop_pos_x=0, - crop_pos_y=0, - crop_d=0, - dtype=dtype.FLOAT32, - ) - self.mean = norm_mean() - self.std = norm_std() - - self.random_flip_input = fn.MediaFunc( - func=RandomFlipFunction, - shape=[self.batch_size], - dtype=dtype.UINT8, - seed=100, - ) - self.random_flip = fn.RandomFlip(horizontal=1) - - SDXLMediaPipe.instance_count += 1 - - def definegraph(self): - jpegs, prompt_embeds, pooled_prompt_embeds, original_sizes, crop_top_lefts = self.input() - images = self.decode(jpegs) - flip = self.random_flip_input() - images = self.random_flip(images, flip) - images = self.cmn(images, self.mean, self.std) - return ( - images, - prompt_embeds, - pooled_prompt_embeds, - original_sizes, - crop_top_lefts, - ) - - -class MediaApiDataLoader(torch.utils.data.DataLoader): - def __init__( - self, - dataset, - resolution, - batch_size=1, - ): - self.dataset = dataset - - from habana_frameworks.mediapipe.plugins.iterator_pytorch import ( - HPUGenericPytorchIterator, - ) - - try: - world_size = get_world_size() - except Exception: - world_size = 1 - - if world_size > 1: - process_index = get_rank() - self.sampler = DistributedSamplerWithLoop( - self.dataset, - num_replicas=world_size, - rank=process_index, - seed=1, - batch_size=batch_size, - ) - else: - self.sampler = torch.utils.data.sampler.RandomSampler(self.dataset) - - pipeline = SDXLMediaPipe( - dataset=dataset, - image_size=resolution, - sampler=self.sampler, - batch_size=batch_size, - drop_last=True, - queue_depth=5, - ) - self.iterator = HPUGenericPytorchIterator(mediapipe=pipeline) - self.epoch = 0 - - def __len__(self): - return len(self.iterator) - - def __iter__(self): - self.iterator.__iter__() - self.epoch += 1 - return self - - def __next__(self): - data = next(self.iterator) - return { - "pixel_values": data[0], - "prompt_embeds": data[1], - "pooled_prompt_embeds": data[2], - "original_sizes": data[3], - "crop_top_lefts": data[4], - } diff --git a/examples/stable-diffusion/training/requirements.txt b/examples/stable-diffusion/training/requirements.txt deleted file mode 100644 index e1d1d1c405..0000000000 --- a/examples/stable-diffusion/training/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -imagesize == 1.4.1 -peft == 0.10.0 diff --git a/examples/stable-diffusion/training/textual_inversion.py b/examples/stable-diffusion/training/textual_inversion.py deleted file mode 100644 index f968ac808c..0000000000 --- a/examples/stable-diffusion/training/textual_inversion.py +++ /dev/null @@ -1,1012 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and - -import argparse -import json -import logging -import math -import os -import random -import shutil -import time -import warnings -from pathlib import Path - -import diffusers -import numpy as np -import PIL -import safetensors -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import transformers -from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration -from diffusers import ( - AutoencoderKL, - UNet2DConditionModel, -) -from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, is_wandb_available -from huggingface_hub import create_repo, upload_folder - -# TODO: remove and import from diffusers.utils when the new version of diffusers is released -from packaging import version -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import CLIPTextModel, CLIPTokenizer - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline -from optimum.habana.utils import set_seed - - -if is_wandb_available(): - import wandb - -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): - PIL_INTERPOLATION = { - "linear": PIL.Image.Resampling.BILINEAR, - "bilinear": PIL.Image.Resampling.BILINEAR, - "bicubic": PIL.Image.Resampling.BICUBIC, - "lanczos": PIL.Image.Resampling.LANCZOS, - "nearest": PIL.Image.Resampling.NEAREST, - } -else: - PIL_INTERPOLATION = { - "linear": PIL.Image.LINEAR, - "bilinear": PIL.Image.BILINEAR, - "bicubic": PIL.Image.BICUBIC, - "lanczos": PIL.Image.LANCZOS, - "nearest": PIL.Image.NEAREST, - } -# ------------------------------------------------------------------------------ - - -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") - -logger = get_logger(__name__) - - -def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None): - img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -tags: -- stable-diffusion -- stable-diffusion-diffusers -- text-to-image -- diffusers -- textual_inversion -inference: true ---- - """ - model_card = f""" -# Textual inversion text2image fine-tuning - {repo_id} -These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n -{img_str} -""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch): - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." 
- ) - # create pipeline (note: unet and vae are loaded again in float32) - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - text_encoder=accelerator.unwrap_model(text_encoder), - tokenizer=tokenizer, - unet=unet, - vae=vae, - safety_checker=None, - revision=args.revision, - variant=args.variant, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=args.gaudi_config_name, - ) - pipeline.scheduler = GaudiDDIMScheduler.from_config(pipeline.scheduler.config) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed) - images = [] - for _ in range(args.num_validation_images): - image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - images.append(image) - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "validation": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images) - ] - } - ) - - del pipeline - return images - - -def save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path, safe_serialization=True): - logger.info("Saving embeddings") - learned_embeds = ( - accelerator.unwrap_model(text_encoder) - .get_input_embeddings() - .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] - ) - learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()} - - if safe_serialization: - safetensors.torch.save_file(learned_embeds_dict, save_path, metadata={"format": "pt"}) - else: - torch.save(learned_embeds_dict, save_path) - - -def parse_args(): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--save_steps", - type=int, - default=500, - help="Save learned_embeds.bin every X updates steps.", - ) - parser.add_argument( - "--save_as_full_pipeline", - action="store_true", - help="Save the complete stable diffusion pipeline.", - ) - parser.add_argument( - "--num_vectors", - type=int, - default=1, - help="How many textual inversion vectors shall be used to learn the concept.", - ) - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--variant", - type=str, - default=None, - help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data." - ) - parser.add_argument( - "--placeholder_token", - type=str, - default=None, - required=True, - help="A token to use as a placeholder for the concept.", - ) - parser.add_argument( - "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word." 
- ) - parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'") - parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.") - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution." - ) - parser.add_argument( - "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--bf16", - action="store_true", - default=False, - help=("Whether to use bf16 mixed precision."), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - help="A prompt that is used during validation to verify that the model is learning.", - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_steps", - type=int, - default=100, - help=( - "Run validation every X steps. Validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images." - ), - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=None, - help=( - "Deprecated in favor of validation_steps. Run validation every X epochs. Validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images." - ), - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" - " training using `--resume_from_checkpoint`." - ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=("Max number of checkpoints to store."), - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help=( - "Whether training should be resumed from a previous checkpoint. Use a path saved by" - ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
- ), - ) - parser.add_argument( - "--no_safe_serialization", - action="store_true", - help="If specified save the checkpoint not in `safetensors` format, but in original PyTorch format instead.", - ) - parser.add_argument( - "--gaudi_config_name", - type=str, - default=None, - help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", - ) - parser.add_argument( - "--throughput_warmup_steps", - type=int, - default=0, - help=( - "Number of steps to ignore for throughput calculation. For example, with throughput_warmup_steps=N, the" - " first N steps will not be considered in the calculation of the throughput. This is especially useful in" - " lazy mode." - ), - ) - - args = parser.parse_args() - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) - if env_local_rank != -1 and env_local_rank != args.local_rank: - args.local_rank = env_local_rank - - if args.train_data_dir is None: - raise ValueError("You must specify a train data directory.") - - return args - - -imagenet_templates_small = [ - "a photo of a {}", - "a rendering of a {}", - "a cropped photo of the {}", - "the photo of a {}", - "a photo of a clean {}", - "a photo of a dirty {}", - "a dark photo of the {}", - "a photo of my {}", - "a photo of the cool {}", - "a close-up photo of a {}", - "a bright photo of the {}", - "a cropped photo of a {}", - "a photo of the {}", - "a good photo of the {}", - "a photo of one {}", - "a close-up photo of the {}", - "a rendition of the {}", - "a photo of the clean {}", - "a rendition of a {}", - "a photo of a nice {}", - "a good photo of a {}", - "a photo of the nice {}", - "a photo of the small {}", - "a photo of the weird {}", - "a photo of the large {}", - "a photo of a cool {}", - "a photo of a small {}", -] - -imagenet_style_templates_small = [ - "a painting in the style of {}", - "a rendering in the style of {}", - "a cropped painting in the style of {}", - "the painting in the style of {}", - "a clean painting in the style of {}", - "a dirty painting in the style of {}", - "a dark painting in the style of {}", - "a picture in the style of {}", - "a cool painting in the style of {}", - "a close-up painting in the style of {}", - "a bright painting in the style of {}", - "a cropped painting in the style of {}", - "a good painting in the style of {}", - "a close-up painting in the style of {}", - "a rendition in the style of {}", - "a nice painting in the style of {}", - "a small painting in the style of {}", - "a weird painting in the style of {}", - "a large painting in the style of {}", -] - - -class TextualInversionDataset(Dataset): - def __init__( - self, - data_root, - tokenizer, - learnable_property="object", # [object, style] - size=512, - repeats=100, - interpolation="bicubic", - flip_p=0.5, - set="train", - placeholder_token="*", - center_crop=False, - ): - self.data_root = data_root - self.tokenizer = tokenizer - self.learnable_property = learnable_property - self.size = size - self.placeholder_token = placeholder_token - self.center_crop = center_crop - self.flip_p = flip_p - - self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)] - self.image_paths = [file for file in self.image_paths[:] if os.path.isfile(file)] - - self.num_images = len(self.image_paths) - self._length = self.num_images - - if set == "train": - self._length = self.num_images * repeats - - self.interpolation = { - "linear": PIL_INTERPOLATION["linear"], - "bilinear": PIL_INTERPOLATION["bilinear"], - "bicubic": 
PIL_INTERPOLATION["bicubic"], - "lanczos": PIL_INTERPOLATION["lanczos"], - }[interpolation] - - self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small - self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p) - - def __len__(self): - return self._length - - def __getitem__(self, i): - example = {} - image = Image.open(self.image_paths[i % self.num_images]) - - if not image.mode == "RGB": - image = image.convert("RGB") - - placeholder_string = self.placeholder_token - text = random.choice(self.templates).format(placeholder_string) - - example["input_ids"] = self.tokenizer( - text, - padding="max_length", - truncation=True, - max_length=self.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids[0] - - # default to score-sde preprocessing - img = np.array(image).astype(np.uint8) - - if self.center_crop: - crop = min(img.shape[0], img.shape[1]) - ( - h, - w, - ) = ( - img.shape[0], - img.shape[1], - ) - img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] - - image = Image.fromarray(img) - image = image.resize((self.size, self.size), resample=self.interpolation) - - image = self.flip_transform(image) - image = np.array(image).astype(np.uint8) - image = (image / 127.5 - 1.0).astype(np.float32) - - example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1) - return example - - -def main(): - args = parse_args() - logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) - - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - - accelerator = GaudiAccelerator( - gradient_accumulation_steps=args.gradient_accumulation_steps, - mixed_precision="bf16" if gaudi_config.use_torch_autocast or args.bf16 else "no", - log_with=args.report_to, - project_config=accelerator_project_config, - force_autocast=gaudi_config.use_torch_autocast or args.bf16, - ) - - if args.report_to == "wandb": - if not is_wandb_available(): - raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - transformers.utils.logging.set_verbosity_warning() - diffusers.utils.logging.set_verbosity_info() - else: - transformers.utils.logging.set_verbosity_error() - diffusers.utils.logging.set_verbosity_error() - - import habana_frameworks.torch.core as htcore - - # If passed along, set the training seed now. 
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if args.push_to_hub: - repo_id = create_repo( - repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token - ).repo_id - - # Load tokenizer - if args.tokenizer_name: - tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") - - # Load scheduler and models - noise_scheduler = GaudiDDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = CLIPTextModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision - ).to(accelerator.device) - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant - ) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant - ) - - # Add the placeholder token in tokenizer - placeholder_tokens = [args.placeholder_token] - - if args.num_vectors < 1: - raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}") - - # add dummy tokens for multi-vector - additional_tokens = [] - for i in range(1, args.num_vectors): - additional_tokens.append(f"{args.placeholder_token}_{i}") - placeholder_tokens += additional_tokens - - num_added_tokens = tokenizer.add_tokens(placeholder_tokens) - if num_added_tokens != args.num_vectors: - raise ValueError( - f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" - " `placeholder_token` that is not already in the tokenizer." - ) - - # Convert the initializer_token, placeholder_token to ids - token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False) - # Check if initializer_token is a single token or a sequence of tokens - if len(token_ids) > 1: - raise ValueError("The initializer token must be a single token.") - - initializer_token_id = token_ids[0] - placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens) - - # Resize the token embeddings as we are adding new special tokens to the tokenizer - text_encoder.resize_token_embeddings(len(tokenizer)) - - # Initialise the newly added placeholder token with the embeddings of the initializer token - token_embeds = text_encoder.get_input_embeddings().weight.data - with torch.no_grad(): - for token_id in placeholder_token_ids: - token_embeds[token_id] = token_embeds[initializer_token_id].clone() - - # Freeze vae and unet - vae.requires_grad_(False) - unet.requires_grad_(False) - # Freeze all parameters except for the token embeddings in text encoder - text_encoder.text_model.encoder.requires_grad_(False) - text_encoder.text_model.final_layer_norm.requires_grad_(False) - text_encoder.text_model.embeddings.position_embedding.requires_grad_(False) - - if args.gradient_checkpointing: - # Keep unet in train mode if we are using gradient checkpointing to save memory. - # The dropout cannot be != 0 so it doesn't matter if we are in eval or train mode. 
- unet.train() - text_encoder.gradient_checkpointing_enable() - unet.enable_gradient_checkpointing() - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes - ) - - # Initialize the optimizer - if gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_cls = FusedAdamW - else: - optimizer_cls = torch.optim.AdamW - optimizer = optimizer_cls( - text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - ) - - # Dataset and DataLoaders creation: - train_dataset = TextualInversionDataset( - data_root=args.train_data_dir, - tokenizer=tokenizer, - size=args.resolution, - placeholder_token=(" ".join(tokenizer.convert_ids_to_tokens(placeholder_token_ids))), - repeats=args.repeats, - learnable_property=args.learnable_property, - center_crop=args.center_crop, - set="train", - ) - train_dataloader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers - ) - if args.validation_epochs is not None: - warnings.warn( - f"FutureWarning: You are doing logging with validation_epochs={args.validation_epochs}." - " Deprecated validation_epochs in favor of `validation_steps`" - f"Setting `args.validation_steps` to {args.validation_epochs * len(train_dataset)}", - FutureWarning, - stacklevel=2, - ) - args.validation_steps = args.validation_epochs * len(train_dataset) - - # Scheduler and math around the number of training steps. - overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes, - num_cycles=args.lr_num_cycles, - ) - - text_encoder.train() - # Prepare everything with our `accelerator`. - text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - text_encoder, optimizer, train_dataloader, lr_scheduler - ) - - # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision - # as these weights are only used for inference, keeping weights in full precision is not required. - weight_dtype = torch.float32 - if gaudi_config.use_torch_autocast or args.bf16: - weight_dtype = torch.bfloat16 - - # Move vae and unet to device and cast to weight_dtype - unet.to(accelerator.device, dtype=weight_dtype) - vae.to(accelerator.device, dtype=weight_dtype) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - # We need to initialize the trackers we use, and also store our configuration. 
- # The trackers initializes automatically on the main process. - if accelerator.is_main_process: - accelerator.init_trackers("textual_inversion", config=vars(args)) - - # Train! - total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - global_step = 0 - first_epoch = 0 - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = os.listdir(args.output_dir) - dirs = [d for d in dirs if d.startswith("checkpoint")] - dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) - path = dirs[-1] if len(dirs) > 0 else None - - if path is None: - accelerator.print( - f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." - ) - args.resume_from_checkpoint = None - initial_global_step = 0 - else: - accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) - global_step = int(path.split("-")[1]) - - initial_global_step = global_step - first_epoch = global_step // num_update_steps_per_epoch - - else: - initial_global_step = 0 - - progress_bar = tqdm( - range(0, args.max_train_steps), - initial=initial_global_step, - desc="Steps", - # Only show the progress bar once on each machine. 
- disable=not accelerator.is_local_main_process, - ) - - # keep original embeddings as reference - orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone() - - t0 = None - - for epoch in range(first_epoch, args.num_train_epochs): - text_encoder.train() - for step, batch in enumerate(train_dataloader): - if t0 is None and global_step == args.throughput_warmup_steps: - t0 = time.perf_counter() - - with accelerator.accumulate(text_encoder): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) - bsz = latents.shape[0] - # Sample a random timestep for each image - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Get the text embedding for conditioning - encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype) - - # Predict the noise residual - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - - accelerator.backward(loss) - htcore.mark_step() - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - htcore.mark_step() - - # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool) - index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False - - with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( - orig_embeds_params[index_no_updates] - ) - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - images = [] - progress_bar.update(1) - global_step += 1 - if global_step % args.save_steps == 0: - weight_name = ( - f"learned_embeds-steps-{global_step}.bin" - if args.no_safe_serialization - else f"learned_embeds-steps-{global_step}.safetensors" - ) - save_path = os.path.join(args.output_dir, weight_name) - save_progress( - text_encoder, - placeholder_token_ids, - accelerator, - args, - save_path, - safe_serialization=not args.no_safe_serialization, - ) - - if accelerator.is_main_process: - if global_step % args.checkpointing_steps == 0: - # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` - if args.checkpoints_total_limit is not None: - checkpoints = os.listdir(args.output_dir) - checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] - checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints - if len(checkpoints) >= 
args.checkpoints_total_limit: - num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 - removing_checkpoints = checkpoints[0:num_to_remove] - - logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" - ) - logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") - - for removing_checkpoint in removing_checkpoints: - removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) - shutil.rmtree(removing_checkpoint) - - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) - logger.info(f"Saved state to {save_path}") - - if args.validation_prompt is not None and global_step % args.validation_steps == 0: - images = log_validation( - text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch - ) - - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - accelerator.log(logs, step=global_step) - - if global_step >= args.max_train_steps: - break - - duration = time.perf_counter() - t0 - throughput = (args.max_train_steps - args.throughput_warmup_steps) * total_batch_size / duration - - # Create the pipeline using the trained modules and save it. - accelerator.wait_for_everyone() - if accelerator.is_main_process: - logger.info(f"Throughput = {throughput} samples/s") - logger.info(f"Train runtime = {duration} seconds") - metrics = { - "train_samples_per_second": throughput, - "train_runtime": duration, - } - with open(f"{args.output_dir}/speed_metrics.json", mode="w") as file: - json.dump(metrics, file) - if args.push_to_hub and not args.save_as_full_pipeline: - logger.warning("Enabling full model saving because --push_to_hub=True was specified.") - save_full_model = True - else: - save_full_model = args.save_as_full_pipeline - if save_full_model: - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - text_encoder=accelerator.unwrap_model(text_encoder), - vae=vae, - unet=unet, - tokenizer=tokenizer, - scheduler=noise_scheduler, - ) - pipeline.save_pretrained(args.output_dir) - # Save the newly trained embeddings - weight_name = "learned_embeds.bin" if args.no_safe_serialization else "learned_embeds.safetensors" - save_path = os.path.join(args.output_dir, weight_name) - save_progress( - text_encoder, - placeholder_token_ids, - accelerator, - args, - save_path, - safe_serialization=not args.no_safe_serialization, - ) - - if args.push_to_hub: - save_model_card( - repo_id, - images=images, - base_model=args.pretrained_model_name_or_path, - repo_folder=args.output_dir, - ) - upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - ignore_patterns=["step_*", "epoch_*"], - ) - - accelerator.end_training() - - -if __name__ == "__main__": - main() diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py deleted file mode 100644 index 0dd6a0102b..0000000000 --- a/examples/stable-diffusion/training/train_controlnet.py +++ /dev/null @@ -1,1172 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -""" -Training script for Conditional Control to Text-to-Image Diffusion Models -Adapted from the following source: -https://github.com/huggingface/diffusers/blob/v0.26.3/examples/controlnet/train_controlnet.py -""" - -import argparse -import json -import logging -import math -import os -import random -import shutil -import time -from pathlib import Path - -import diffusers -import habana_frameworks.torch.core as htcore -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import transformers -from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration -from datasets import load_dataset -from diffusers import ( - AutoencoderKL, - ControlNetModel, - UNet2DConditionModel, - UniPCMultistepScheduler, -) -from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, is_wandb_available -from diffusers.utils.torch_utils import is_compiled_module -from huggingface_hub import create_repo, upload_folder -from PIL import Image -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionControlNetPipeline -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.10.0") -if is_wandb_available(): - import wandb - -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") - -logger = get_logger(__name__) - - -def image_grid(imgs, rows, cols): - assert len(imgs) == rows * cols - - w, h = imgs[0].size - grid = Image.new("RGB", size=(cols * w, rows * h)) - - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid - - -def log_validation( - vae, - text_encoder, - tokenizer, - unet, - controlnet, - args, - accelerator, - noise_scheduler, - weight_dtype, - step, - gaudi_config, -): - logger.info("Running validation... 
") - - controlnet = accelerator.unwrap_model(controlnet) - - pipeline = GaudiStableDiffusionControlNetPipeline.from_pretrained( - args.pretrained_model_name_or_path, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - safety_checker=None, - revision=args.revision, - variant=args.variant, - scheduler=noise_scheduler, - use_habana=True, - use_hpu_graphs=args.use_hpu_graphs, - gaudi_config=gaudi_config, - ) - pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config) - pipeline = pipeline.to(accelerator.device) - pipeline.set_progress_bar_config(disable=True) - - if args.seed is None: - generator = None - elif accelerator.device == torch.device("hpu"): - # torch.Generator() is unsupported on HPU - generator = set_seed(args.seed) - else: - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) - - if len(args.validation_image) == len(args.validation_prompt): - validation_images = args.validation_image - validation_prompts = args.validation_prompt - elif len(args.validation_image) == 1: - validation_images = args.validation_image * len(args.validation_prompt) - validation_prompts = args.validation_prompt - elif len(args.validation_prompt) == 1: - validation_images = args.validation_image - validation_prompts = args.validation_prompt * len(args.validation_image) - else: - raise ValueError( - "number of `args.validation_image` and `args.validation_prompt` should be checked in `parse_args`" - ) - - image_logs = [] - - for validation_prompt, validation_image in zip(validation_prompts, validation_images): - validation_image = Image.open(validation_image).convert("RGB") - - images = [] - - for _ in range(args.num_validation_images): - with torch.autocast(device_type="hpu", dtype=weight_dtype, enabled=gaudi_config.use_torch_autocast): - image = pipeline( - validation_prompt, validation_image, num_inference_steps=20, generator=generator - ).images[0] - images.append(image) - - image_logs.append( - {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt} - ) - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - for log in image_logs: - images = log["images"] - validation_prompt = log["validation_prompt"] - validation_image = log["validation_image"] - - formatted_images = [] - - formatted_images.append(np.asarray(validation_image)) - - for image in images: - formatted_images.append(np.asarray(image)) - - formatted_images = np.stack(formatted_images) - - tracker.writer.add_images(validation_prompt, formatted_images, step, dataformats="NHWC") - elif tracker.name == "wandb": - formatted_images = [] - - for log in image_logs: - images = log["images"] - validation_prompt = log["validation_prompt"] - validation_image = log["validation_image"] - - formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning")) - - for image in images: - image = wandb.Image(image, caption=validation_prompt) - formatted_images.append(image) - - tracker.log({"validation": formatted_images}) - else: - logger.warn(f"image logging not implemented for {tracker.name}") - - return image_logs - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, - subfolder="text_encoder", - revision=revision, - ) - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers 
import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation - - return RobertaSeriesModelWithTransformation - else: - raise ValueError(f"{model_class} is not supported.") - - -def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=None): - img_str = "" - if image_logs is not None: - img_str = "You can find some example images below.\n" - for i, log in enumerate(image_logs): - images = log["images"] - validation_prompt = log["validation_prompt"] - validation_image = log["validation_image"] - validation_image.save(os.path.join(repo_folder, "image_control.png")) - img_str += f"prompt: {validation_prompt}\n" - images = [validation_image] + images - image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png")) - img_str += f"![images_{i})](./images_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -tags: -- stable-diffusion -- stable-diffusion-diffusers -- text-to-image -- diffusers -- controlnet -inference: true ---- - """ - model_card = f""" -# controlnet-{repo_id} - -These are controlnet weights trained on {base_model} with new type of conditioning. -{img_str} -""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--controlnet_model_name_or_path", - type=str, - default=None, - help="Path to pretrained controlnet model or model identifier from huggingface.co/models." - " If not specified controlnet weights are initialized from unet.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--variant", - type=str, - default=None, - help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--output_dir", - type=str, - default="controlnet-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " - "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." - "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." - "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" - "instructions." - ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=("Max number of checkpoints to store."), - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help=( - "Whether training should be resumed from a previous checkpoint. Use a path saved by" - ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' - ), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument( - "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." - ) - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--bf16", - action="store_true", - default=False, - help=("Whether to use bf16 mixed precision."), - ) - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand." - ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--train_data_dir", - type=str, - default=None, - help=( - "A folder containing the training data. Folder contents must follow the structure described in" - " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" - " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), - ) - parser.add_argument( - "--image_column", type=str, default="image", help="The column of the dataset containing the target image." - ) - parser.add_argument( - "--conditioning_image_column", - type=str, - default="conditioning_image", - help="The column of the dataset containing the controlnet conditioning image.", - ) - parser.add_argument( - "--caption_column", - type=str, - default="text", - help="The column of the dataset containing a caption or a list of captions.", - ) - parser.add_argument( - "--max_train_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ), - ) - parser.add_argument( - "--proportion_empty_prompts", - type=float, - default=0, - help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).", - ) - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - nargs="+", - help=( - "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`." 
- " Provide either a matching number of `--validation_image`s, a single `--validation_image`" - " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s." - ), - ) - parser.add_argument( - "--validation_image", - type=str, - default=None, - nargs="+", - help=( - "A set of paths to the controlnet conditioning image be evaluated every `--validation_steps`" - " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s, a" - " a single `--validation_prompt` to be used with all `--validation_image`s, or a single" - " `--validation_image` that will be used with all `--validation_prompt`s." - ), - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair", - ) - parser.add_argument( - "--validation_steps", - type=int, - default=100, - help=( - "Run validation every X steps. Validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images." - ), - ) - parser.add_argument( - "--tracker_project_name", - type=str, - default="train_controlnet", - help=( - "The `project_name` argument passed to Accelerator.init_trackers for" - " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" - ), - ) - ( - parser.add_argument( - "--gaudi_config_name", - type=str, - default="Habana/stable-diffusion", - help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", - ), - ) - parser.add_argument( - "--throughput_warmup_steps", - type=int, - default=0, - help=( - "Number of steps to ignore for throughput calculation. For example, with throughput_warmup_steps=N, the" - " first N steps will not be considered in the calculation of the throughput. This is especially useful in" - " lazy mode." - ), - ) - parser.add_argument("--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU.") - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - if args.dataset_name is None and args.train_data_dir is None: - raise ValueError("Specify either `--dataset_name` or `--train_data_dir`") - - if args.dataset_name is not None and args.train_data_dir is not None: - raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`") - - if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: - raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") - - if args.validation_prompt is not None and args.validation_image is None: - raise ValueError("`--validation_image` must be set if `--validation_prompt` is set") - - if args.validation_prompt is None and args.validation_image is not None: - raise ValueError("`--validation_prompt` must be set if `--validation_image` is set") - - if ( - args.validation_image is not None - and args.validation_prompt is not None - and len(args.validation_image) != 1 - and len(args.validation_prompt) != 1 - and len(args.validation_image) != len(args.validation_prompt) - ): - raise ValueError( - "Must provide either 1 `--validation_image`, 1 `--validation_prompt`," - " or the same number of `--validation_prompt`s and `--validation_image`s" - ) - - if args.resolution % 8 != 0: - raise ValueError( - "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder." 
- ) - - return args - - -def make_train_dataset(args, tokenizer, accelerator): - # Get the datasets: you can either provide your own training and evaluation files (see below) - # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - if args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - ) - else: - if args.train_data_dir is not None: - dataset = load_dataset( - args.train_data_dir, - cache_dir=args.cache_dir, - ) - # See more about loading custom images at - # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - column_names = dataset["train"].column_names - - # 6. Get the column names for input/target. - if args.image_column is None: - image_column = column_names[0] - logger.info(f"image column defaulting to {image_column}") - else: - image_column = args.image_column - if image_column not in column_names: - raise ValueError( - f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" - ) - - if args.caption_column is None: - caption_column = column_names[1] - logger.info(f"caption column defaulting to {caption_column}") - else: - caption_column = args.caption_column - if caption_column not in column_names: - raise ValueError( - f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" - ) - - if args.conditioning_image_column is None: - conditioning_image_column = column_names[2] - logger.info(f"conditioning image column defaulting to {conditioning_image_column}") - else: - conditioning_image_column = args.conditioning_image_column - if conditioning_image_column not in column_names: - raise ValueError( - f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" - ) - - def tokenize_captions(examples, is_train=True): - captions = [] - for caption in examples[caption_column]: - if random.random() < args.proportion_empty_prompts: - captions.append("") - elif isinstance(caption, str): - captions.append(caption) - elif isinstance(caption, (list, np.ndarray)): - # take a random caption if there are multiple - captions.append(random.choice(caption) if is_train else caption[0]) - else: - raise ValueError( - f"Caption column `{caption_column}` should contain either strings or lists of strings." 
- ) - inputs = tokenizer( - captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt" - ) - return inputs.input_ids - - image_transforms = transforms.Compose( - [ - transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(args.resolution), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - conditioning_image_transforms = transforms.Compose( - [ - transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(args.resolution), - transforms.ToTensor(), - ] - ) - - def preprocess_train(examples): - images = [image.convert("RGB") for image in examples[image_column]] - images = [image_transforms(image) for image in images] - - conditioning_images = [image.convert("RGB") for image in examples[conditioning_image_column]] - conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images] - - examples["pixel_values"] = images - examples["conditioning_pixel_values"] = conditioning_images - examples["input_ids"] = tokenize_captions(examples) - - return examples - - with accelerator.main_process_first(): - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) - # Set the training transforms - train_dataset = dataset["train"].with_transform(preprocess_train) - - return train_dataset - - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - - conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples]) - conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float() - - input_ids = torch.stack([example["input_ids"] for example in examples]) - - return { - "pixel_values": pixel_values, - "conditioning_pixel_values": conditioning_pixel_values, - "input_ids": input_ids, - } - - -def main(args): - logging_dir = Path(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - - # Set autocast to True for --bf16 - if args.bf16: - gaudi_config.use_torch_autocast = True - accelerator = GaudiAccelerator( - gradient_accumulation_steps=args.gradient_accumulation_steps, - mixed_precision="bf16" if gaudi_config.use_torch_autocast else "no", - log_with=args.report_to, - project_config=accelerator_project_config, - force_autocast=gaudi_config.use_torch_autocast, - ) - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - transformers.utils.logging.set_verbosity_warning() - diffusers.utils.logging.set_verbosity_info() - else: - transformers.utils.logging.set_verbosity_error() - diffusers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. 
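# Note that `set_seed` below is the Gaudi-aware helper imported from `optimum.habana.utils`
# at the top of this script; as the validation helper above points out, `torch.Generator()`
# is unsupported on HPU, so reproducibility is handled through this helper rather than a
# device-bound generator.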
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if args.push_to_hub: - repo_id = create_repo( - repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token - ).repo_id - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, - ) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) - - # Load scheduler and models - noise_scheduler = GaudiDDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant - ) - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant - ) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant - ) - - if args.controlnet_model_name_or_path: - logger.info("Loading existing controlnet weights") - controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path) - else: - logger.info("Initializing controlnet weights from unet") - controlnet = ControlNetModel.from_unet(unet) - - # Taken from [Sayak Paul's Diffusers PR #6511](https://github.com/huggingface/diffusers/pull/6511/files) - def unwrap_model(model): - model = accelerator.unwrap_model(model) - model = model._orig_mod if is_compiled_module(model) else model - return model - - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format - def save_model_hook(models, weights, output_dir): - if accelerator.is_main_process: - i = len(weights) - 1 - - while len(weights) > 0: - weights.pop() - model = models[i] - - sub_dir = "controlnet" - model.save_pretrained(os.path.join(output_dir, sub_dir)) - - i -= 1 - - def load_model_hook(models, input_dir): - while len(models) > 0: - # pop models so that they are not loaded again - model = models.pop() - - # load diffusers style into model - load_model = ControlNetModel.from_pretrained(input_dir, subfolder="controlnet") - model.register_to_config(**load_model.config) - - model.load_state_dict(load_model.state_dict()) - del load_model - - accelerator.register_save_state_pre_hook(save_model_hook) - accelerator.register_load_state_pre_hook(load_model_hook) - - vae.requires_grad_(False) - unet.requires_grad_(False) - text_encoder.requires_grad_(False) - controlnet.train() - - if args.gradient_checkpointing: - controlnet.enable_gradient_checkpointing() - - # Check that all trainable models are in full precision - low_precision_error_string = ( - " Please make sure to always have all model weights in full float32 precision when starting training - even if" - " doing mixed precision training, copy of the weights should still be float32." - ) - - if unwrap_model(controlnet).dtype != torch.float32: - raise ValueError( - f"Controlnet loaded as datatype {unwrap_model(controlnet).dtype}. 
{low_precision_error_string}" - ) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes - ) - - # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs - if args.use_8bit_adam: - try: - import bitsandbytes as bnb - except ImportError: - raise ImportError( - "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) - - optimizer_class = bnb.optim.AdamW8bit - elif gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_class = FusedAdamW - else: - optimizer_class = torch.optim.AdamW - - # Optimizer creation - params_to_optimize = controlnet.parameters() - optimizer = optimizer_class( - params_to_optimize, - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - ) - - train_dataset = make_train_dataset(args, tokenizer, accelerator) - - train_dataloader = torch.utils.data.DataLoader( - train_dataset, - shuffle=True, - collate_fn=collate_fn, - batch_size=args.train_batch_size, - num_workers=args.dataloader_num_workers, - ) - - # Scheduler and math around the number of training steps. - overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. - weight_dtype = torch.float32 - if gaudi_config.use_torch_autocast: - weight_dtype = torch.bfloat16 - - # Move controlnet to device prior to calling prepare() - controlnet.to(accelerator.device, dtype=weight_dtype) - # Prepare everything with our `accelerator`. - controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - controlnet, optimizer, train_dataloader, lr_scheduler - ) - - # Move vae, unet and text_encoder to device and cast to weight_dtype - vae.to(accelerator.device, dtype=weight_dtype) - unet.to(accelerator.device, dtype=weight_dtype) - text_encoder.to(accelerator.device, dtype=weight_dtype) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. 
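# Worked example of the step bookkeeping above (illustrative numbers, not taken from the
# original script): with a dataloader of 1,000 batches and --gradient_accumulation_steps=4,
# num_update_steps_per_epoch = ceil(1000 / 4) = 250. If --max_train_steps was left unset and
# --num_train_epochs=1, max_train_steps becomes 1 * 250 = 250, and num_train_epochs is then
# recomputed as ceil(250 / 250) = 1.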
- if accelerator.is_main_process: - tracker_config = dict(vars(args)) - - # tensorboard cannot handle list types for config - tracker_config.pop("validation_prompt") - tracker_config.pop("validation_image") - - accelerator.init_trackers(args.tracker_project_name, config=tracker_config) - - # Train! - total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - global_step = 0 - first_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = os.listdir(args.output_dir) - dirs = [d for d in dirs if d.startswith("checkpoint")] - dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) - path = dirs[-1] if len(dirs) > 0 else None - - if path is None: - accelerator.print( - f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." - ) - args.resume_from_checkpoint = None - initial_global_step = 0 - else: - accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) - global_step = int(path.split("-")[1]) - - initial_global_step = global_step - first_epoch = global_step // num_update_steps_per_epoch - else: - initial_global_step = 0 - - progress_bar = tqdm( - range(0, args.max_train_steps), - initial=initial_global_step, - desc="Steps", - # Only show the progress bar once on each machine. 
- disable=not accelerator.is_local_main_process, - ) - - image_logs = None - t0 = None - for epoch in range(first_epoch, args.num_train_epochs): - for step, batch in enumerate(train_dataloader): - if t0 is None and global_step == args.throughput_warmup_steps: - t0 = time.perf_counter() - with accelerator.accumulate(controlnet): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) - bsz = latents.shape[0] - # Sample a random timestep for each image - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Get the text embedding for conditioning - encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0] - - controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype) - - down_block_res_samples, mid_block_res_sample = controlnet( - noisy_latents, - timesteps, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=controlnet_image, - return_dict=False, - ) - - # Predict the noise residual - model_pred = unet( - noisy_latents, - timesteps, - encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=[ - sample.to(dtype=weight_dtype) for sample in down_block_res_samples - ], - mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype), - return_dict=False, - )[0] - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - - accelerator.backward(loss) - htcore.mark_step() - if accelerator.sync_gradients: - params_to_clip = controlnet.parameters() - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - htcore.mark_step() - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - global_step += 1 - - if accelerator.is_main_process: - if global_step % args.checkpointing_steps == 0: - # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` - if args.checkpoints_total_limit is not None: - checkpoints = os.listdir(args.output_dir) - checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] - checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints - if len(checkpoints) >= args.checkpoints_total_limit: - num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 - removing_checkpoints = checkpoints[0:num_to_remove] - - logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" - ) - logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") - - for 
removing_checkpoint in removing_checkpoints: - removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) - shutil.rmtree(removing_checkpoint) - - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) - logger.info(f"Saved state to {save_path}") - - if args.validation_prompt is not None and global_step % args.validation_steps == 0: - image_logs = log_validation( - vae, - text_encoder, - tokenizer, - unet, - controlnet, - args, - accelerator, - noise_scheduler, - weight_dtype, - global_step, - gaudi_config, - ) - - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - accelerator.log(logs, step=global_step) - - if global_step >= args.max_train_steps: - break - - duration = time.perf_counter() - t0 - throughput = (args.max_train_steps - args.throughput_warmup_steps) * total_batch_size / duration - - # Create the pipeline using using the trained modules and save it. - accelerator.wait_for_everyone() - if accelerator.is_main_process: - logger.info(f"Throughput = {throughput} samples/s") - logger.info(f"Train runtime = {duration} seconds") - metrics = { - "train_samples_per_second": throughput, - "train_runtime": duration, - } - with open(f"{args.output_dir}/speed_metrics.json", mode="w") as file: - json.dump(metrics, file) - controlnet = unwrap_model(controlnet) - controlnet.save_pretrained(args.output_dir) - - if args.push_to_hub: - save_model_card( - repo_id, - image_logs=image_logs, - base_model=args.pretrained_model_name_or_path, - repo_folder=args.output_dir, - ) - upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - ignore_patterns=["step_*", "epoch_*"], - ) - - accelerator.end_training() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/stable-diffusion/training/train_dreambooth.py b/examples/stable-diffusion/training/train_dreambooth.py deleted file mode 100644 index b34f3c12c5..0000000000 --- a/examples/stable-diffusion/training/train_dreambooth.py +++ /dev/null @@ -1,1357 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -""" -Training script for DreamBooth to Text-to-Image Diffusion Models -Adapted from the following source: -https://github.com/huggingface/peft/blob/608a90ded9985ee1c5912d738082bb1fd618902b/examples/stable_diffusion/train_dreambooth.py -""" - -import argparse -import gc -import hashlib -import itertools -import logging -import math -import os -import threading -import warnings -from pathlib import Path -from typing import Union - -import datasets -import diffusers -import habana_frameworks.torch.core as htcore -import numpy as np -import psutil -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import transformers -from accelerate.logging import get_logger -from accelerate.utils import DistributedDataParallelKwargs -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - DPMSolverMultistepScheduler, - UNet2DConditionModel, -) -from diffusers.optimization import get_scheduler -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.torch_utils import is_compiled_module -from habana_frameworks.torch.hpu import memory_stats -from huggingface_hub import HfApi -from peft import LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, get_peft_model -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType -from optimum.habana.diffusers import GaudiStableDiffusionPipeline -from optimum.habana.transformers.trainer import _is_peft_model -from optimum.habana.utils import set_seed - - -logger = get_logger(__name__) - -UNET_TARGET_MODULES = [ - "to_q", - "to_k", - "to_v", - "proj", - "proj_in", - "proj_out", - "conv", - "conv1", - "conv2", - "conv_shortcut", - "to_out.0", - "time_emb_proj", - "ff.net.2", -] - -TEXT_ENCODER_TARGET_MODULES = ["fc1", "fc2", "q_proj", "k_proj", "v_proj", "out_proj"] - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, - subfolder="text_encoder", - revision=revision, - ) - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation - - return RobertaSeriesModelWithTransformation - else: - raise ValueError(f"{model_class} is not supported.") - - -def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig]: - if args.adapter == "full": - raise ValueError("Cannot create unet adapter config for full parameter") - - if args.adapter == "lora": - config = LoraConfig( - r=args.unet_r, - lora_alpha=args.unet_alpha, - target_modules=UNET_TARGET_MODULES, - lora_dropout=args.unet_dropout, - bias=args.unet_bias, - init_lora_weights=True, - ) - elif args.adapter == "loha": - config = LoHaConfig( - r=args.unet_r, - alpha=args.unet_alpha, - target_modules=UNET_TARGET_MODULES, - rank_dropout=args.unet_rank_dropout, - module_dropout=args.unet_module_dropout, - use_effective_conv2d=args.unet_use_effective_conv2d, - 
init_weights=True, - ) - elif args.adapter == "lokr": - config = LoKrConfig( - r=args.unet_r, - alpha=args.unet_alpha, - target_modules=UNET_TARGET_MODULES, - rank_dropout=args.unet_rank_dropout, - module_dropout=args.unet_module_dropout, - use_effective_conv2d=args.unet_use_effective_conv2d, - decompose_both=args.unet_decompose_both, - decompose_factor=args.unet_decompose_factor, - init_weights=True, - ) - elif args.adapter == "oft": - config = OFTConfig( - r=args.unet_r, - target_modules=UNET_TARGET_MODULES, - module_dropout=args.unet_dropout, - init_weights=True, - coft=args.unet_use_coft, - eps=args.unet_eps, - ) - else: - raise ValueError(f"Unknown adapter type {args.adapter}") - - return config - - -def create_text_encoder_adapter_config( - args: argparse.Namespace, -) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig]: - if args.adapter == "full": - raise ValueError("Cannot create text_encoder adapter config for full parameter") - - if args.adapter == "lora": - config = LoraConfig( - r=args.te_r, - lora_alpha=args.te_alpha, - target_modules=TEXT_ENCODER_TARGET_MODULES, - lora_dropout=args.te_dropout, - bias=args.te_bias, - init_lora_weights=True, - ) - elif args.adapter == "loha": - config = LoHaConfig( - r=args.te_r, - alpha=args.te_alpha, - target_modules=TEXT_ENCODER_TARGET_MODULES, - rank_dropout=args.te_rank_dropout, - module_dropout=args.te_module_dropout, - init_weights=True, - ) - elif args.adapter == "lokr": - config = LoKrConfig( - r=args.te_r, - alpha=args.te_alpha, - target_modules=TEXT_ENCODER_TARGET_MODULES, - rank_dropout=args.te_rank_dropout, - module_dropout=args.te_module_dropout, - decompose_both=args.te_decompose_both, - decompose_factor=args.te_decompose_factor, - init_weights=True, - ) - elif args.adapter == "oft": - config = OFTConfig( - r=args.te_r, - target_modules=TEXT_ENCODER_TARGET_MODULES, - module_dropout=args.te_dropout, - init_weights=True, - coft=args.te_use_coft, - eps=args.te_eps, - ) - else: - raise ValueError(f"Unknown adapter type {args.adapter}") - - return config - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The 
weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - help="A prompt that is used during validation to verify that the model is learning.", - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_steps", - type=int, - default=100, - help=( - "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" - ) - parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") - - parser.add_argument( - "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." - ) - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" - " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" - " training using `--resume_from_checkpoint`." - ), - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help=( - "Whether training should be resumed from a previous checkpoint. Use a path saved by" - ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
- ), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--wandb_key", - type=str, - default=None, - help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), - ) - parser.add_argument( - "--wandb_project_name", - type=str, - default=None, - help=("If report to option is set to wandb, project name in wandb for log tracking "), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default=None, - choices=["no", "bf16"], - help=( - "Whether to use mixed precision. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." 
- ), - ) - parser.add_argument( - "--prior_generation_precision", - type=str, - default=None, - choices=["no", "fp32", "bf16"], - help=("Choose prior generation precision between fp32 and bf16 (bfloat16)."), - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." - ) - parser.add_argument( - "--gaudi_config_name", - type=str, - default=None, - help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", - ) - parser.add_argument( - "--throughput_warmup_steps", - type=int, - default=0, - help=( - "Number of steps to ignore for throughput calculation. For example, with throughput_warmup_steps=N, the" - " first N steps will not be considered in the calculation of the throughput. This is especially useful in" - " lazy mode." - ), - ) - parser.add_argument( - "--use_hpu_graphs_for_training", - action="store_true", - help="Use HPU graphs for training on HPU.", - ) - parser.add_argument( - "--use_hpu_graphs_for_inference", - action="store_true", - help="Use HPU graphs for inference on HPU.", - ) - - # Adapter arguments - subparsers = parser.add_subparsers(dest="adapter") - - # Dummy subparser to train whole model - subparsers.add_parser("full", help="Train full model without adapters") - - # LoRA adapter - lora = subparsers.add_parser("lora", help="Use LoRA adapter") - lora.add_argument("--unet_r", type=int, default=8, help="LoRA rank for unet") - lora.add_argument("--unet_alpha", type=int, default=8, help="LoRA alpha for unet") - lora.add_argument("--unet_dropout", type=float, default=0.0, help="LoRA dropout probability for unet") - lora.add_argument( - "--unet_bias", - type=str, - default="none", - help="Bias type for LoRA. Can be 'none', 'all' or 'lora_only'", - ) - lora.add_argument( - "--te_r", type=int, default=8, help="LoRA rank for text_encoder, only used if `train_text_encoder` is True" - ) - lora.add_argument( - "--te_alpha", - type=int, - default=8, - help="LoRA alpha for text_encoder, only used if `train_text_encoder` is True", - ) - lora.add_argument( - "--te_dropout", - type=float, - default=0.0, - help="LoRA dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - lora.add_argument( - "--te_bias", - type=str, - default="none", - help="Bias type for LoRA. 
Can be 'none', 'all' or 'lora_only', only used if `train_text_encoder` is True", - ) - - # LoHa adapter - loha = subparsers.add_parser("loha", help="Use LoHa adapter") - loha.add_argument("--unet_r", type=int, default=8, help="LoHa rank for unet") - loha.add_argument("--unet_alpha", type=int, default=8, help="LoHa alpha for unet") - loha.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoHa rank_dropout probability for unet") - loha.add_argument( - "--unet_module_dropout", type=float, default=0.0, help="LoHa module_dropout probability for unet" - ) - loha.add_argument( - "--unet_use_effective_conv2d", - action="store_true", - help="Use parameter effective decomposition in unet for Conv2d 3x3 with ksize > 1", - ) - loha.add_argument( - "--te_r", type=int, default=8, help="LoHa rank for text_encoder, only used if `train_text_encoder` is True" - ) - loha.add_argument( - "--te_alpha", - type=int, - default=8, - help="LoHa alpha for text_encoder, only used if `train_text_encoder` is True", - ) - loha.add_argument( - "--te_rank_dropout", - type=float, - default=0.0, - help="LoHa rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - loha.add_argument( - "--te_module_dropout", - type=float, - default=0.0, - help="LoHa module_dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - - # LoKr adapter - lokr = subparsers.add_parser("lokr", help="Use LoKr adapter") - lokr.add_argument("--unet_r", type=int, default=8, help="LoKr rank for unet") - lokr.add_argument("--unet_alpha", type=int, default=8, help="LoKr alpha for unet") - lokr.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoKr rank_dropout probability for unet") - lokr.add_argument( - "--unet_module_dropout", type=float, default=0.0, help="LoKr module_dropout probability for unet" - ) - lokr.add_argument( - "--unet_use_effective_conv2d", - action="store_true", - help="Use parameter effective decomposition in unet for Conv2d 3x3 with ksize > 1", - ) - lokr.add_argument( - "--unet_decompose_both", action="store_true", help="Decompose left matrix in kronecker product for unet" - ) - lokr.add_argument( - "--unet_decompose_factor", type=int, default=-1, help="Decompose factor in kronecker product for unet" - ) - lokr.add_argument( - "--te_r", type=int, default=8, help="LoKr rank for text_encoder, only used if `train_text_encoder` is True" - ) - lokr.add_argument( - "--te_alpha", - type=int, - default=8, - help="LoKr alpha for text_encoder, only used if `train_text_encoder` is True", - ) - lokr.add_argument( - "--te_rank_dropout", - type=float, - default=0.0, - help="LoKr rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - lokr.add_argument( - "--te_module_dropout", - type=float, - default=0.0, - help="LoKr module_dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - lokr.add_argument( - "--te_decompose_both", - action="store_true", - help="Decompose left matrix in kronecker product for text_encoder, only used if `train_text_encoder` is True", - ) - lokr.add_argument( - "--te_decompose_factor", - type=int, - default=-1, - help="Decompose factor in kronecker product for text_encoder, only used if `train_text_encoder` is True", - ) - # oft adapter - oft = subparsers.add_parser("oft", help="Use Oft adapter") - oft.add_argument("--unet_r", type=int, default=8, help="Oft rank for unet") - oft.add_argument("--unet_dropout", type=float, default=0.0, help="Oft dropout 
probability for unet") - oft.add_argument("--unet_use_coft", action="store_true", help="Using constrained OFT in unet") - oft.add_argument("--unet_eps", type=float, default=0.0, help="The control strength of COFT for unet") - oft.add_argument( - "--te_r", type=int, default=8, help="Oft rank for text_encoder, only used if `train_text_encoder` is True" - ) - oft.add_argument( - "--te_dropout", - type=float, - default=0.0, - help="Oft dropout probability for text_encoder, only used if `train_text_encoder` is True", - ) - oft.add_argument( - "--te_use_coft", - action="store_true", - help="Using constrained OFT in text_encoder, only used if `train_text_encoder` is True", - ) - oft.add_argument( - "--te_eps", - type=float, - default=0.0, - help="The control strength of COFT for text_encoder, only used if `train_text_encoder` is True", - ) - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) - if env_local_rank != -1 and env_local_rank != args.local_rank: - args.local_rank = env_local_rank - - if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - # logger is not available yet - if args.class_data_dir is not None: - warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") - - return args - - -# Converting Bytes to Megabytes -def b2mb(x): - return int(x / 2**20) - - -# This context manager is used to track the peak memory usage of the process -class TorchTracemalloc: - def __enter__(self): - gc.collect() - mem_stats = memory_stats() - - self.begin = mem_stats["InUse"] - self.process = psutil.Process() - - self.cpu_begin = self.cpu_mem_used() - self.peak_monitoring = True - peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) - peak_monitor_thread.daemon = True - peak_monitor_thread.start() - return self - - def cpu_mem_used(self): - """get resident set size memory for the current process""" - return self.process.memory_info().rss - - def peak_monitor_func(self): - self.cpu_peak = -1 - - while True: - self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) - - # can't sleep or will not catch the peak right (this comment is here on purpose) - # time.sleep(0.001) # 1msec - - if not self.peak_monitoring: - break - - def __exit__(self, *exc): - self.peak_monitoring = False - - gc.collect() - mem_stats = memory_stats() - - self.end = mem_stats["InUse"] - self.peak = mem_stats["MaxInUse"] - self.used = b2mb(self.end - self.begin) - self.peaked = b2mb(self.peak - self.begin) - - self.cpu_end = self.cpu_mem_used() - self.cpu_used = b2mb(self.cpu_end - self.cpu_begin) - self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin) - - -class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. 
- """ - - def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - size=512, - center_crop=False, - ): - self.size = size - self.center_crop = center_crop - self.tokenizer = tokenizer - - self.instance_data_root = Path(instance_data_root) - if not self.instance_data_root.exists(): - raise ValueError("Instance images root doesn't exists.") - - self.instance_images_path = list(Path(instance_data_root).iterdir()) - self.num_instance_images = len(self.instance_images_path) - self.instance_prompt = instance_prompt - self._length = self.num_instance_images - - if class_data_root is not None: - self.class_data_root = Path(class_data_root) - self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = list(self.class_data_root.iterdir()) - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, self.num_instance_images) - self.class_prompt = class_prompt - else: - self.class_data_root = None - - self.image_transforms = transforms.Compose( - [ - transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def __len__(self): - return self._length - - def __getitem__(self, index): - example = {} - instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) - example["instance_prompt_ids"] = self.tokenizer( - self.instance_prompt, - truncation=True, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids - - if self.class_data_root: - class_image = Image.open(self.class_images_path[index % self.num_class_images]) - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, - truncation=True, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids - - return example - - -def collate_fn(examples, with_prior_preservation=False): - input_ids = [example["instance_prompt_ids"] for example in examples] - pixel_values = [example["instance_images"] for example in examples] - - # Concat class and instance examples for prior preservation. - # We do this to avoid doing two forward passes. - if with_prior_preservation: - input_ids += [example["class_prompt_ids"] for example in examples] - pixel_values += [example["class_images"] for example in examples] - - pixel_values = torch.stack(pixel_values) - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - - input_ids = torch.cat(input_ids, dim=0) - - batch = { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - return batch - - -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
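# A minimal usage sketch of the pieces defined above (DreamBoothDataset, collate_fn and the
# TorchTracemalloc context manager); the tokenizer checkpoint, the ./dog folder and the prompt
# are placeholder values, and the script's module-level imports (torch, psutil, habana memory_stats)
# are assumed to be available:
#
#     from functools import partial
#
#     import torch
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained(
#         "CompVis/stable-diffusion-v1-4", subfolder="tokenizer", use_fast=False
#     )
#     dataset = DreamBoothDataset(
#         instance_data_root="./dog",            # placeholder folder of instance images
#         instance_prompt="a photo of sks dog",  # placeholder instance prompt
#         tokenizer=tokenizer,
#         size=512,
#     )
#     loader = torch.utils.data.DataLoader(
#         dataset,
#         batch_size=2,
#         shuffle=True,
#         collate_fn=partial(collate_fn, with_prior_preservation=False),
#     )
#     with TorchTracemalloc() as tm:  # needs an HPU: it reads habana memory_stats()
#         batch = next(iter(loader))  # {"input_ids": (2, 77), "pixel_values": (2, 3, 512, 512)}
#     print(f"HPU MB used: {tm.used}, peaked: {tm.peaked}; CPU MB used: {tm.cpu_used}")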
- - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example - - -def main(args): - logging_dir = Path(args.output_dir, args.logging_dir) - - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - gaudi_config.use_torch_autocast = gaudi_config.use_torch_autocast or args.mixed_precision == "bf16" - accelerator = GaudiAccelerator( - gradient_accumulation_steps=args.gradient_accumulation_steps, - mixed_precision=args.mixed_precision, - log_with=args.report_to, - project_dir=logging_dir, - force_autocast=gaudi_config.use_torch_autocast, - ) - if args.report_to == "wandb": - import wandb - - wandb.login(key=args.wandb_key) - wandb.init(project=args.wandb_project_name) - # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate - # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. - # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. - if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: - raise ValueError( - "Gradient accumulation is not supported when training the text encoder in distributed training. " - "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." - ) - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_warning() - diffusers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - diffusers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Generate class images if prior preservation is enabled. 
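# Class images are the regularization set for the prior-preservation loss: they are only sampled
# when --class_data_dir holds fewer than --num_class_images files, generation goes through
# GaudiStableDiffusionPipeline on HPU (bf16 unless --prior_generation_precision overrides it),
# and each file is saved as "<index>-<sha1 of the image bytes>.jpg", so rerunning the script only
# tops the folder up instead of regenerating everything.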
- if args.with_prior_preservation: - class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - torch_dtype = torch.bfloat16 if accelerator.device.type == "hpu" else torch.float32 - if args.prior_generation_precision == "fp32": - torch_dtype = torch.float32 - elif args.prior_generation_precision == "bf16": - torch_dtype = torch.bfloat16 - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - torch_dtype=torch_dtype, - safety_checker=None, - revision=args.revision, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) - - sample_dataloader = accelerator.prepare(sample_dataloader) - pipeline.to(accelerator.device) - - for example in tqdm( - sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process - ): - images = pipeline(example["prompt"]).images - - for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - image.save(image_filename) - - del pipeline - - # Handle the repository creation - if accelerator.is_main_process: - if args.push_to_hub: - api = HfApi(token=args.hub_token) - # Create repo (repo_name from args or inferred) - repo_name = args.hub_model_id - if repo_name is None: - repo_name = Path(args.output_dir).absolute().name - repo_id = api.create_repo(repo_name, exist_ok=True).repo_id - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, - ) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) - - # Load scheduler and models - noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision - ) - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision - ) - if args.adapter != "full": - config = create_unet_adapter_config(args) - unet = get_peft_model(unet, 
config) - unet.print_trainable_parameters() - unet.to(accelerator.device) - vae.requires_grad_(False) - if not args.train_text_encoder: - text_encoder.requires_grad_(False) - elif args.train_text_encoder and args.adapter != "full": - config = create_text_encoder_adapter_config(args) - text_encoder = get_peft_model(text_encoder, config) - text_encoder.print_trainable_parameters() - text_encoder.to(accelerator.device) - if args.enable_xformers_memory_efficient_attention: - if is_xformers_available(): - unet.enable_xformers_memory_efficient_attention() - else: - raise ValueError("xformers is not available. Make sure it is installed correctly") - - if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - if args.train_text_encoder and not args.adapter != "full": - text_encoder.gradient_checkpointing_enable() - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes - ) - - if gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_class = FusedAdamW - else: - optimizer_class = torch.optim.AdamW - - # Optimizer creation - params_to_optimize = ( - itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() - ) - optimizer = optimizer_class( - params_to_optimize, - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - ) - - # Dataset and DataLoaders creation: - train_dataset = DreamBoothDataset( - instance_data_root=args.instance_data_dir, - instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir if args.with_prior_preservation else None, - class_prompt=args.class_prompt, - tokenizer=tokenizer, - size=args.resolution, - center_crop=args.center_crop, - ) - - train_dataloader = torch.utils.data.DataLoader( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True, - collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), - num_workers=1, - ) - - # Scheduler and math around the number of training steps. - overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - - # Prepare everything with our `accelerator`. - if args.train_text_encoder: - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) - else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, optimizer, train_dataloader, lr_scheduler - ) - - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. 
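# For example, with `--mixed_precision bf16` the weight_dtype chosen below resolves to
# torch.bfloat16: the frozen VAE and, when it is not being trained, the text encoder are cast to
# bf16, while the trainable parameters prepared above stay in full precision and run under
# autocast for bf16 compute.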
- weight_dtype = torch.float32 - if accelerator.mixed_precision == "bf16": - weight_dtype = torch.bfloat16 - - # Move vae and text_encoder to device and cast to weight_dtype - vae.to(accelerator.device, dtype=weight_dtype) - if not args.train_text_encoder: - text_encoder.to(accelerator.device, dtype=weight_dtype) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. - if accelerator.is_main_process: - accelerator.init_trackers("dreambooth", config=vars(args)) - - def unwrap_model(model, training=False): - model = accelerator.unwrap_model(model) - model = model._orig_mod if is_compiled_module(model) else model - if not training: - return model - else: - if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU: - kwargs = {} - kwargs["gradient_as_bucket_view"] = True - accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) - if args.use_hpu_graphs_for_training: - if _is_peft_model(model): - base_model = model.get_base_model() - htcore.hpu.ModuleCacher()(model=base_model, inplace=True) - else: - htcore.hpu.ModuleCacher()(model=model, inplace=True) - return model - - unwrap_model(model=unet, training=True) - if args.train_text_encoder: - unwrap_model(model=text_encoder, training=True) - # Train! - total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - global_step = 0 - first_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the mos recent checkpoint - dirs = os.listdir(args.output_dir) - dirs = [d for d in dirs if d.startswith("checkpoint")] - dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) - path = dirs[-1] - accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) - global_step = int(path.split("-")[1]) - - resume_global_step = global_step * args.gradient_accumulation_steps - first_epoch = resume_global_step // num_update_steps_per_epoch - resume_step = resume_global_step % num_update_steps_per_epoch - - # Only show the progress bar once on each machine. 
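# (Worked example of the resume arithmetic above, with illustrative numbers: resuming from
# "checkpoint-120" with gradient_accumulation_steps=2 and num_update_steps_per_epoch=100 gives
# resume_global_step = 120 * 2 = 240, first_epoch = 240 // 100 = 2 and resume_step = 240 % 100 = 40,
# so the loop below restarts at epoch 2 and skips its first 40 dataloader batches.)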
- progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) - progress_bar.set_description("Steps") - - for epoch in range(first_epoch, args.num_train_epochs): - unet.train() - if args.train_text_encoder: - text_encoder.train() - with TorchTracemalloc() as tracemalloc: - for step, batch in enumerate(train_dataloader): - # Skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - if args.report_to == "wandb": - accelerator.print(progress_bar) - continue - - with accelerator.accumulate(unet): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * 0.18215 - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) - bsz = latents.shape[0] - # Sample a random timestep for each image - timesteps = torch.randint( - 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device - ) - timesteps = timesteps.long() - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Get the text embedding for conditioning - encoder_hidden_states = text_encoder(batch["input_ids"])[0] - - # Predict the noise residual - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.with_prior_preservation: - # Chunk the noise and model_pred into two parts and compute the loss on each part separately. - model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) - target, target_prior = torch.chunk(target, 2, dim=0) - - # Compute instance loss - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - - # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") - - # Add the prior loss to the instance loss. 
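# Because collate_fn stacked the class examples after the instance examples, the chunks above are
# model_pred/target for the instance images and model_pred_prior/target_prior for the class
# images. The next line therefore implements the DreamBooth objective
#     loss = MSE(instance) + prior_loss_weight * MSE(class)
# where the second term keeps the fine-tuned model close to its prior for the class while the
# first term fits the new subject.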
- loss = loss + args.prior_loss_weight * prior_loss - else: - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - - accelerator.backward(loss) - htcore.mark_step() - if accelerator.sync_gradients: - params_to_clip = ( - itertools.chain(unet.parameters(), text_encoder.parameters()) - if args.train_text_encoder - else unet.parameters() - ) - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - htcore.mark_step() - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - if args.report_to == "wandb": - accelerator.print(progress_bar) - global_step += 1 - - # if global_step % args.checkpointing_steps == 0: - # if accelerator.is_main_process: - # save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - # accelerator.save_state(save_path) - # logger.info(f"Saved state to {save_path}") - - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - accelerator.log(logs, step=global_step) - - if ( - args.validation_prompt is not None - and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 - ): - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - # create pipeline - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - safety_checker=None, - revision=args.revision, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - # set `keep_fp32_wrapper` to True because we do not want to remove - # mixed precision hooks while we are still training - pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) - pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - pipeline = pipeline.to(accelerator.device) - pipeline.set_progress_bar_config(disable=True) - - # Set evaliation mode - pipeline.unet.eval() - pipeline.text_encoder.eval() - - # run inference - if args.seed is not None: - if accelerator.device == torch.device("hpu"): - # torch.Generator() is unsupported on HPU - generator = set_seed(args.seed) - else: - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) - else: - generator = None - images = [] - for _ in range(args.num_validation_images): - image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - images.append(image) - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - import wandb - - tracker.log( - { - "validation": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - - # Set evaliation mode - pipeline.unet.train() - if args.train_text_encoder: - pipeline.text_encoder.train() - - del pipeline - - if global_step >= args.max_train_steps: - break - # Printing the HPU memory usage details such as allocated memory, peak memory, and total memory usage - accelerator.print(f"HPU Memory before entering the train : {b2mb(tracemalloc.begin)}") - accelerator.print(f"HPU Memory consumed at 
the end of the train (end-begin): {tracemalloc.used}") - accelerator.print(f"HPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}") - accelerator.print( - f"HPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}" - ) - - accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}") - accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}") - accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}") - accelerator.print( - f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}" - ) - - # Create the pipeline using using the trained modules and save it. - accelerator.wait_for_everyone() - if accelerator.is_main_process: - if args.adapter != "full": - unwarpped_unet = unwrap_model(unet) - unwarpped_unet.save_pretrained( - os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) - ) - if args.train_text_encoder: - unwarpped_text_encoder = unwrap_model(text_encoder) - unwarpped_text_encoder.save_pretrained( - os.path.join(args.output_dir, "text_encoder"), - state_dict=accelerator.get_state_dict(text_encoder), - ) - else: - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), - revision=args.revision, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - pipeline.save_pretrained(args.output_dir) - - if args.push_to_hub: - api.upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - run_as_future=True, - ) - - accelerator.end_training() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py deleted file mode 100644 index ea34c50773..0000000000 --- a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py +++ /dev/null @@ -1,1768 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -""" -Training script for LORA DreamBooth to Text-to-Image Diffusion Models -Adapted from the following source: -https://github.com/huggingface/diffusers/blob/v0.26.3/examples/dreambooth/train_dreambooth_lora_sdxl.py -""" - -import argparse -import gc -import itertools -import logging -import math -import os -import shutil -import warnings -from pathlib import Path - -import diffusers -import habana_frameworks.torch.core as htcore -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import transformers -from accelerate.logging import get_logger -from accelerate.utils import DistributedDataParallelKwargs -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - DPMSolverMultistepScheduler, - UNet2DConditionModel, -) -from diffusers.loaders import LoraLoaderMixin -from diffusers.optimization import get_scheduler -from diffusers.training_utils import _set_state_dict_into_text_encoder, compute_snr -from diffusers.utils import ( - check_min_version, - convert_state_dict_to_diffusers, - convert_unet_state_dict_to_peft, - is_wandb_available, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.torch_utils import is_compiled_module -from huggingface_hub import create_repo, upload_folder -from huggingface_hub.utils import insecure_hashlib -from packaging import version -from peft import LoraConfig, set_peft_model_state_dict -from peft.utils import get_peft_model_state_dict -from PIL import Image -from PIL.ImageOps import exif_transpose -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType -from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline -from optimum.habana.transformers.trainer import _is_peft_model -from optimum.habana.utils import set_seed - - -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") - -logger = get_logger(__name__) - - -def save_model_card( - repo_id: str, - images=None, - base_model=str, - train_text_encoder=False, - instance_prompt=str, - validation_prompt=str, - repo_folder=None, - vae_path=None, -): - img_str = "widget:\n" if images else "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f""" - - text: '{validation_prompt if validation_prompt else ' ' }' - output: - url: - "image_{i}.png" - """ - - yaml = f""" ---- -tags: -- stable-diffusion-xl -- stable-diffusion-xl-diffusers -- text-to-image -- diffusers -- lora -- template:sd-lora -{img_str} -base_model: {base_model} -instance_prompt: {instance_prompt} -license: openrail++ ---- - """ - - model_card = f""" -# SDXL LoRA DreamBooth - {repo_id} - - - -## Model description - -These are {repo_id} LoRA adaption weights for {base_model}. - -The weights were trained using [DreamBooth](https://dreambooth.github.io/). - -LoRA for the text encoder was enabled: {train_text_encoder}. - -Special VAE used for training: {vae_path}. - -## Trigger words - -You should use {instance_prompt} to trigger the image generation. - -## Download model - -Weights for this model are available in Safetensors format. 
- -[Download]({repo_id}/tree/main) them in the Files & versions tab. - -""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder" -): - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, subfolder=subfolder, revision=revision - ) - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "CLIPTextModelWithProjection": - from transformers import CLIPTextModelWithProjection - - return CLIPTextModelWithProjection - else: - raise ValueError(f"{model_class} is not supported.") - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--pretrained_vae_model_name_or_path", - type=str, - default=None, - help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--variant", - type=str, - default=None, - help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", - ) - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand." - ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - help=("A folder containing the training data. "), - ) - - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - - parser.add_argument( - "--image_column", - type=str, - default="image", - help="The column of the dataset containing the target image. By " - "default, the standard Image Dataset maps out 'file_name' " - "to 'image'.", - ) - parser.add_argument( - "--caption_column", - type=str, - default=None, - help="The column of the dataset containing the instance prompt for each image", - ) - - parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.") - - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance, e.g. 
'photo of a TOK dog', 'in the style of TOK'", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - help="A prompt that is used during validation to verify that the model is learning.", - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=50, - help=( - "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="lora-dreambooth-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=1024, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--crops_coords_top_left_h", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) - parser.add_argument( - "--crops_coords_top_left_w", - type=int, - default=0, - help=("Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet."), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - ) - parser.add_argument( - "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." - ) - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" - " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" - " training using `--resume_from_checkpoint`." 
- ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=("Max number of checkpoints to store."), - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help=( - "Whether training should be resumed from a previous checkpoint. Use a path saved by" - ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' - ), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - - parser.add_argument( - "--text_encoder_lr", - type=float, - default=5e-6, - help="Text encoder learning rate to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - - parser.add_argument( - "--snr_gamma", - type=float, - default=None, - help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " - "More details here: https://arxiv.org/abs/2303.09556.", - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - - parser.add_argument( - "--optimizer", - type=str, - default="AdamW", - help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'), - ) - - parser.add_argument( - "--use_8bit_adam", - action="store_true", - help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW", - ) - - parser.add_argument( - "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers." - ) - parser.add_argument( - "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers." - ) - parser.add_argument( - "--prodigy_beta3", - type=float, - default=None, - help="coefficients for computing the Prodidy stepsize using running averages. If set to None, " - "uses the value of square root of beta2. 
Ignored if optimizer is adamW", - ) - parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay") - parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params") - parser.add_argument( - "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder" - ) - - parser.add_argument( - "--adam_epsilon", - type=float, - default=1e-08, - help="Epsilon value for the Adam optimizer and Prodigy optimizers.", - ) - - parser.add_argument( - "--prodigy_use_bias_correction", - type=bool, - default=True, - help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW", - ) - parser.add_argument( - "--prodigy_safeguard_warmup", - type=bool, - default=True, - help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. " - "Ignored if optimizer is adamW", - ) - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default=None, - choices=["no", "bf16"], - help=( - "Whether to use mixed precision. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." - ), - ) - parser.add_argument( - "--prior_generation_precision", - type=str, - default=None, - choices=["no", "fp32", "bf16"], - help=("Choose prior generation precision between fp32 and bf16 (bfloat16)."), - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) - parser.add_argument( - "--rank", - type=int, - default=4, - help=("The dimension of the LoRA update matrices."), - ) - parser.add_argument( - "--gaudi_config_name", - type=str, - default=None, - help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", - ) - parser.add_argument( - "--use_hpu_graphs_for_training", - action="store_true", - help="Use HPU graphs for training on HPU.", - ) - parser.add_argument( - "--use_hpu_graphs_for_inference", - action="store_true", - help="Use HPU graphs for inference on HPU.", - ) - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - if args.dataset_name is None and args.instance_data_dir is None: - raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`") - - if args.dataset_name is not None and args.instance_data_dir is not None: - raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`") - - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) - if env_local_rank != -1 and env_local_rank != args.local_rank: - args.local_rank = env_local_rank - - if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - # logger is not available yet - if args.class_data_dir is not None: - warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") - - return args - - -class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images. - """ - - def __init__( - self, - instance_data_root, - instance_prompt, - class_prompt, - class_data_root=None, - class_num=None, - size=1024, - repeats=1, - center_crop=False, - ): - self.size = size - self.center_crop = center_crop - - self.instance_prompt = instance_prompt - self.custom_instance_prompts = None - self.class_prompt = class_prompt - - # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory, - # we load the training data using load_dataset - if args.dataset_name is not None: - try: - from datasets import load_dataset - except ImportError: - raise ImportError( - "You are trying to load your data using the datasets library. If you wish to train using custom " - "captions please install the datasets library: `pip install datasets`. If you wish to load a " - "local folder containing images only, specify --instance_data_dir instead." - ) - # Downloading and loading a dataset from the hub. - # See more about loading custom images at - # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - ) - # Preprocessing the datasets. - column_names = dataset["train"].column_names - - # 6. Get the column names for input/target. - if args.image_column is None: - image_column = column_names[0] - logger.info(f"image column defaulting to {image_column}") - else: - image_column = args.image_column - if image_column not in column_names: - raise ValueError( - f"`--image_column` value '{args.image_column}' not found in dataset columns. 
Dataset columns are: {', '.join(column_names)}" - ) - instance_images = dataset["train"][image_column] - - if args.caption_column is None: - logger.info( - "No caption column provided, defaulting to instance_prompt for all images. If your dataset " - "contains captions/prompts for the images, make sure to specify the " - "column as --caption_column" - ) - self.custom_instance_prompts = None - else: - if args.caption_column not in column_names: - raise ValueError( - f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" - ) - custom_instance_prompts = dataset["train"][args.caption_column] - # create final list of captions according to --repeats - self.custom_instance_prompts = [] - for caption in custom_instance_prompts: - self.custom_instance_prompts.extend(itertools.repeat(caption, repeats)) - else: - self.instance_data_root = Path(instance_data_root) - if not self.instance_data_root.exists(): - raise ValueError("Instance images root doesn't exists.") - - instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())] - self.custom_instance_prompts = None - - self.instance_images = [] - for img in instance_images: - self.instance_images.extend(itertools.repeat(img, repeats)) - self.num_instance_images = len(self.instance_images) - self._length = self.num_instance_images - - if class_data_root is not None: - self.class_data_root = Path(class_data_root) - self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = list(self.class_data_root.iterdir()) - if class_num is not None: - self.num_class_images = min(len(self.class_images_path), class_num) - else: - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, self.num_instance_images) - else: - self.class_data_root = None - - self.image_transforms = transforms.Compose( - [ - transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def __len__(self): - return self._length - - def __getitem__(self, index): - example = {} - instance_image = self.instance_images[index % self.num_instance_images] - instance_image = exif_transpose(instance_image) - - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) - - if self.custom_instance_prompts: - caption = self.custom_instance_prompts[index % self.num_instance_images] - if caption: - example["instance_prompt"] = caption - else: - example["instance_prompt"] = self.instance_prompt - - else: # costum prompts were provided, but length does not match size of image dataset - example["instance_prompt"] = self.instance_prompt - - if self.class_data_root: - class_image = Image.open(self.class_images_path[index % self.num_class_images]) - class_image = exif_transpose(class_image) - - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_prompt"] = self.class_prompt - - return example - - -def collate_fn(examples, with_prior_preservation=False): - pixel_values = [example["instance_images"] for example in examples] - prompts = [example["instance_prompt"] for example in examples] - - # Concat class and instance examples for prior preservation. 
- # We do this to avoid doing two forward passes. - if with_prior_preservation: - pixel_values += [example["class_images"] for example in examples] - prompts += [example["class_prompt"] for example in examples] - - pixel_values = torch.stack(pixel_values) - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - - batch = {"pixel_values": pixel_values, "prompts": prompts} - return batch - - -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." - - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example - - -def tokenize_prompt(tokenizer, prompt): - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - return text_input_ids - - -# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt -def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None): - prompt_embeds_list = [] - - for i, text_encoder in enumerate(text_encoders): - if tokenizers is not None: - tokenizer = tokenizers[i] - text_input_ids = tokenize_prompt(tokenizer, prompt) - else: - assert text_input_ids_list is not None - text_input_ids = text_input_ids_list[i] - - prompt_embeds = text_encoder( - text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False - ) - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-1][-2] - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1) - return prompt_embeds, pooled_prompt_embeds - - -def main(args): - logging_dir = Path(args.output_dir, args.logging_dir) - - logging_dir = Path(args.output_dir, args.logging_dir) - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - gaudi_config.use_torch_autocast = gaudi_config.use_torch_autocast or args.mixed_precision == "bf16" - accelerator = GaudiAccelerator( - gradient_accumulation_steps=args.gradient_accumulation_steps, - mixed_precision=args.mixed_precision, - log_with=args.report_to, - project_dir=logging_dir, - force_autocast=gaudi_config.use_torch_autocast, - ) - if args.report_to == "wandb": - if not is_wandb_available(): - raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - import wandb - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - transformers.utils.logging.set_verbosity_warning() - diffusers.utils.logging.set_verbosity_info() - else: - transformers.utils.logging.set_verbosity_error() - diffusers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Generate class images if prior preservation is enabled. 
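# Before the prior-preservation block below, a short sketch of how the encode_prompt helper above
# combines SDXL's two text encoders; the checkpoint id and prompt are placeholders:
#
#     from transformers import AutoTokenizer, CLIPTextModel, CLIPTextModelWithProjection
#
#     model_id = "stabilityai/stable-diffusion-xl-base-1.0"
#     tokenizers = [
#         AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer", use_fast=False),
#         AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2", use_fast=False),
#     ]
#     text_encoders = [
#         CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder"),
#         CLIPTextModelWithProjection.from_pretrained(model_id, subfolder="text_encoder_2"),
#     ]
#     prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, "a photo of sks dog")
#     # prompt_embeds concatenates the penultimate hidden states of both encoders (768 + 1280 = 2048 dims);
#     # pooled_prompt_embeds is the projected pooled output of the second encoder, which SDXL uses as
#     # added text conditioning.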
- if args.with_prior_preservation: - class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - torch_dtype = torch.bfloat16 if accelerator.device.type == "hpu" else torch.float32 - if args.prior_generation_precision == "fp32": - torch_dtype = torch.float32 - elif args.prior_generation_precision == "bf16": - torch_dtype = torch.bfloat16 - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - torch_dtype=torch_dtype, - revision=args.revision, - variant=args.variant, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) - - sample_dataloader = accelerator.prepare(sample_dataloader) - pipeline.to(accelerator.device) - - for example in tqdm( - sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process - ): - images = pipeline(example["prompt"]).images - - for i, image in enumerate(images): - hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest() - image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - image.save(image_filename) - - del pipeline - - # Handle the repository creation - if accelerator.is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if args.push_to_hub: - repo_id = create_repo( - repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token - ).repo_id - - # Load the tokenizers - tokenizer_one = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, - ) - tokenizer_two = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer_2", - revision=args.revision, - use_fast=False, - ) - - # import correct text encoder classes - text_encoder_cls_one = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path, args.revision - ) - text_encoder_cls_two = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2" - ) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder_one = text_encoder_cls_one.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant - ) - text_encoder_two = text_encoder_cls_two.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant - ) - vae_path = ( - args.pretrained_model_name_or_path - if args.pretrained_vae_model_name_or_path is None - else args.pretrained_vae_model_name_or_path - ) - vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, - revision=args.revision, - variant=args.variant, - ) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, 
subfolder="unet", revision=args.revision, variant=args.variant - ) - - # We only train the additional adapter LoRA layers - vae.requires_grad_(False) - text_encoder_one.requires_grad_(False) - text_encoder_two.requires_grad_(False) - unet.requires_grad_(False) - - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision - # as these weights are only used for inference, keeping weights in full precision is not required. - weight_dtype = torch.float32 - if accelerator.mixed_precision == "bf16": - weight_dtype = torch.bfloat16 - - # Move unet, vae and text_encoder to device and cast to weight_dtype - unet.to(accelerator.device, dtype=weight_dtype) - - # The VAE is always in float32 to avoid NaN losses. - vae.to(accelerator.device, dtype=torch.float32) - - text_encoder_one.to(accelerator.device, dtype=weight_dtype) - text_encoder_two.to(accelerator.device, dtype=weight_dtype) - - if args.enable_xformers_memory_efficient_attention: - if is_xformers_available(): - import xformers - - xformers_version = version.parse(xformers.__version__) - if xformers_version == version.parse("0.0.16"): - logger.warn( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, " - "please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." - ) - unet.enable_xformers_memory_efficient_attention() - else: - raise ValueError("xformers is not available. Make sure it is installed correctly") - - if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - if args.train_text_encoder: - text_encoder_one.gradient_checkpointing_enable() - text_encoder_two.gradient_checkpointing_enable() - - # now we will add new LoRA weights to the attention layers - unet_lora_config = LoraConfig( - r=args.rank, - lora_alpha=args.rank, - init_lora_weights="gaussian", - target_modules=["to_k", "to_q", "to_v", "to_out.0"], - ) - unet.add_adapter(unet_lora_config) - - # The text encoder comes from 🤗 transformers, so we cannot directly modify it. - # So, instead, we monkey-patch the forward calls of its attention-blocks. - if args.train_text_encoder: - text_lora_config = LoraConfig( - r=args.rank, - lora_alpha=args.rank, - init_lora_weights="gaussian", - target_modules=["q_proj", "k_proj", "v_proj", "out_proj"], - ) - text_encoder_one.add_adapter(text_lora_config) - text_encoder_two.add_adapter(text_lora_config) - - def unwrap_model(model, training=False): - model = accelerator.unwrap_model(model) - model = model._orig_mod if is_compiled_module(model) else model - if not training: - return model - else: - if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU: - kwargs = {} - kwargs["gradient_as_bucket_view"] = True - accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) - if args.use_hpu_graphs_for_training: - if _is_peft_model(model): - base_model = model.get_base_model() - htcore.hpu.ModuleCacher()(model=base_model, inplace=True) - else: - htcore.hpu.ModuleCacher()(model=model, inplace=True) - return model - - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format - def save_model_hook(models, weights, output_dir): - if accelerator.is_main_process: - # there are only two options here. 
Either they are just the unet attention processor layers - # or they are the unet and text encoder attention layers - unet_lora_layers_to_save = None - text_encoder_one_lora_layers_to_save = None - text_encoder_two_lora_layers_to_save = None - - for model in models: - if isinstance(model, type(unwrap_model(unet))): - unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) - elif isinstance(model, type(unwrap_model(text_encoder_one))): - text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( - get_peft_model_state_dict(model) - ) - elif isinstance(model, type(unwrap_model(text_encoder_two))): - text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( - get_peft_model_state_dict(model) - ) - else: - raise ValueError(f"unexpected save model: {model.__class__}") - - # make sure to pop weight so that corresponding model is not saved again - weights.pop() - - GaudiStableDiffusionXLPipeline.save_lora_weights( - output_dir, - unet_lora_layers=unet_lora_layers_to_save, - text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, - text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, - ) - - def load_model_hook(models, input_dir): - unet_ = None - text_encoder_one_ = None - text_encoder_two_ = None - - while len(models) > 0: - model = models.pop() - - if isinstance(model, type(unwrap_model(unet))): - unet_ = model - elif isinstance(model, type(unwrap_model(text_encoder_one))): - text_encoder_one_ = model - elif isinstance(model, type(unwrap_model(text_encoder_two))): - text_encoder_two_ = model - else: - raise ValueError(f"unexpected model to load: {model.__class__}") - - lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - - unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} - unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) - incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") - if incompatible_keys is not None: - # check only for unexpected keys - unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) - if unexpected_keys: - logger.warning( - f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " - f" {unexpected_keys}. " - ) - - if args.train_text_encoder: - # Do we need to call `scale_lora_layers()` here?
- _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_) - - _set_state_dict_into_text_encoder( - lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_ - ) - - accelerator.register_save_state_pre_hook(save_model_hook) - accelerator.register_load_state_pre_hook(load_model_hook) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes - ) - - unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters())) - - if args.train_text_encoder: - text_lora_parameters_one = list(filter(lambda p: p.requires_grad, text_encoder_one.parameters())) - text_lora_parameters_two = list(filter(lambda p: p.requires_grad, text_encoder_two.parameters())) - - # Optimization parameters - unet_lora_parameters_with_lr = {"params": unet_lora_parameters, "lr": args.learning_rate} - if args.train_text_encoder: - # different learning rate for text encoder and unet - text_lora_parameters_one_with_lr = { - "params": text_lora_parameters_one, - "weight_decay": args.adam_weight_decay_text_encoder, - "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate, - } - text_lora_parameters_two_with_lr = { - "params": text_lora_parameters_two, - "weight_decay": args.adam_weight_decay_text_encoder, - "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate, - } - params_to_optimize = [ - unet_lora_parameters_with_lr, - text_lora_parameters_one_with_lr, - text_lora_parameters_two_with_lr, - ] - else: - params_to_optimize = [unet_lora_parameters_with_lr] - - # Optimizer creation - if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"): - logger.warn( - f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]." - "Defaulting to adamW" - ) - args.optimizer = "adamw" - - if args.use_8bit_adam and not args.optimizer.lower() == "adamw": - logger.warn( - f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was " - f"set to {args.optimizer.lower()}" - ) - - if args.optimizer.lower() == "adamw": - if args.use_8bit_adam: - try: - import bitsandbytes as bnb - except ImportError: - raise ImportError( - "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) - - optimizer_class = bnb.optim.AdamW8bit - elif gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_class = FusedAdamW - else: - optimizer_class = torch.optim.AdamW - - optimizer = optimizer_class( - params_to_optimize, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - ) - - if args.optimizer.lower() == "prodigy": - try: - import prodigyopt - except ImportError: - raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`") - - optimizer_class = prodigyopt.Prodigy - - if args.learning_rate <= 0.1: - logger.warn( - "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0" - ) - if args.train_text_encoder and args.text_encoder_lr: - logger.warn( - f"Learning rates were provided both for the unet and the text encoder- e.g. text_encoder_lr:" - f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. " - f"When using prodigy only learning_rate is used as the initial learning rate." 
- ) - # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be - # --learning_rate - params_to_optimize[1]["lr"] = args.learning_rate - params_to_optimize[2]["lr"] = args.learning_rate - - optimizer = optimizer_class( - params_to_optimize, - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - beta3=args.prodigy_beta3, - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - decouple=args.prodigy_decouple, - use_bias_correction=args.prodigy_use_bias_correction, - safeguard_warmup=args.prodigy_safeguard_warmup, - ) - - # Dataset and DataLoaders creation: - train_dataset = DreamBoothDataset( - instance_data_root=args.instance_data_dir, - instance_prompt=args.instance_prompt, - class_prompt=args.class_prompt, - class_data_root=args.class_data_dir if args.with_prior_preservation else None, - class_num=args.num_class_images, - size=args.resolution, - repeats=args.repeats, - center_crop=args.center_crop, - ) - - train_dataloader = torch.utils.data.DataLoader( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True, - collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), - num_workers=args.dataloader_num_workers, - ) - - # Computes additional embeddings/ids required by the SDXL UNet. - # regular text embeddings (when `train_text_encoder` is not True) - # pooled text embeddings - # time ids - - def compute_time_ids(): - # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids - original_size = (args.resolution, args.resolution) - target_size = (args.resolution, args.resolution) - crops_coords_top_left = (args.crops_coords_top_left_h, args.crops_coords_top_left_w) - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) - return add_time_ids - - if not args.train_text_encoder: - tokenizers = [tokenizer_one, tokenizer_two] - text_encoders = [text_encoder_one, text_encoder_two] - - def compute_text_embeddings(prompt, text_encoders, tokenizers): - with torch.no_grad(): - prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt) - prompt_embeds = prompt_embeds.to(accelerator.device) - pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device) - return prompt_embeds, pooled_prompt_embeds - - # Handle instance prompt. - instance_time_ids = compute_time_ids() - - # If no type of tuning is done on the text_encoder and custom instance prompts are NOT - # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid - # the redundant encoding. - if not args.train_text_encoder and not train_dataset.custom_instance_prompts: - instance_prompt_hidden_states, instance_pooled_prompt_embeds = compute_text_embeddings( - args.instance_prompt, text_encoders, tokenizers - ) - - # Handle class prompt for prior-preservation. - if args.with_prior_preservation: - class_time_ids = compute_time_ids() - if not args.train_text_encoder: - class_prompt_hidden_states, class_pooled_prompt_embeds = compute_text_embeddings( - args.class_prompt, text_encoders, tokenizers - ) - - # Clear the memory here - if not args.train_text_encoder and not train_dataset.custom_instance_prompts: - del tokenizers, text_encoders - gc.collect() - - # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images), - # pack the statically computed variables appropriately here. 
This is so that we don't - # have to pass them to the dataloader. - add_time_ids = instance_time_ids - if args.with_prior_preservation: - add_time_ids = torch.cat([add_time_ids, class_time_ids], dim=0) - - if not train_dataset.custom_instance_prompts: - if not args.train_text_encoder: - prompt_embeds = instance_prompt_hidden_states - unet_add_text_embeds = instance_pooled_prompt_embeds - if args.with_prior_preservation: - prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0) - unet_add_text_embeds = torch.cat([unet_add_text_embeds, class_pooled_prompt_embeds], dim=0) - # if we're optimizing the text encoder (whether the instance prompt is used for all images or custom prompts are provided) we need to tokenize and encode the - # batch prompts on all training steps - else: - tokens_one = tokenize_prompt(tokenizer_one, args.instance_prompt) - tokens_two = tokenize_prompt(tokenizer_two, args.instance_prompt) - if args.with_prior_preservation: - class_tokens_one = tokenize_prompt(tokenizer_one, args.class_prompt) - class_tokens_two = tokenize_prompt(tokenizer_two, args.class_prompt) - tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0) - tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0) - - # Scheduler and math around the number of training steps. - overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - - # Prepare everything with our `accelerator`. - if args.train_text_encoder: - unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler - ) - else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, optimizer, train_dataloader, lr_scheduler - ) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initialize automatically on the main process. - if accelerator.is_main_process: - accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args)) - - unwrap_model(model=unet, training=True) - if args.train_text_encoder: - unwrap_model(model=text_encoder_one, training=True) - unwrap_model(model=text_encoder_two, training=True) - # Train!
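A standalone restatement of the step bookkeeping used a few lines above (illustrative helper, not part of the script): one optimization step is taken every `gradient_accumulation_steps` batches, and whichever of `max_train_steps` / `num_train_epochs` was not given is derived from the other once the prepared dataloader's length is known.

import math
from typing import Optional

def resolve_steps_and_epochs(num_batches: int, grad_accum_steps: int, num_epochs: int, max_steps: Optional[int] = None):
    # An optimization step happens once per grad_accum_steps batches.
    updates_per_epoch = math.ceil(num_batches / grad_accum_steps)
    if max_steps is None:
        # No explicit step budget: derive it from the requested number of epochs.
        max_steps = num_epochs * updates_per_epoch
    # Recompute epochs so they stay consistent with the (possibly resharded) dataloader.
    num_epochs = math.ceil(max_steps / updates_per_epoch)
    return max_steps, num_epochs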
- total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - global_step = 0 - first_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = os.listdir(args.output_dir) - dirs = [d for d in dirs if d.startswith("checkpoint")] - dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) - path = dirs[-1] if len(dirs) > 0 else None - - if path is None: - accelerator.print( - f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." - ) - args.resume_from_checkpoint = None - initial_global_step = 0 - else: - accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) - global_step = int(path.split("-")[1]) - - initial_global_step = global_step - first_epoch = global_step // num_update_steps_per_epoch - - else: - initial_global_step = 0 - - progress_bar = tqdm( - range(0, args.max_train_steps), - initial=initial_global_step, - desc="Steps", - # Only show the progress bar once on each machine.
- disable=not accelerator.is_local_main_process, - ) - - for epoch in range(first_epoch, args.num_train_epochs): - unet.train() - if args.train_text_encoder: - text_encoder_one.train() - text_encoder_two.train() - - # Set top parameter requires_grad = True so that gradient checkpointing works - accelerator.unwrap_model(text_encoder_one).text_model.embeddings.requires_grad_(True) - accelerator.unwrap_model(text_encoder_two).text_model.embeddings.requires_grad_(True) - - for step, batch in enumerate(train_dataloader): - with accelerator.accumulate(unet): - pixel_values = batch["pixel_values"].to(dtype=vae.dtype) - prompts = batch["prompts"] - - # encode batch prompts when custom prompts are provided for each image - - if train_dataset.custom_instance_prompts: - if not args.train_text_encoder: - prompt_embeds, unet_add_text_embeds = compute_text_embeddings( - prompts, text_encoders, tokenizers - ) - else: - tokens_one = tokenize_prompt(tokenizer_one, prompts) - tokens_two = tokenize_prompt(tokenizer_two, prompts) - - # Convert images to latent space - model_input = vae.encode(pixel_values).latent_dist.sample() - model_input = model_input * vae.config.scaling_factor - if args.pretrained_vae_model_name_or_path is None: - model_input = model_input.to(weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(model_input) - bsz = model_input.shape[0] - # Sample a random timestep for each image - timesteps = torch.randint( - 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device - ) - timesteps = timesteps.long() - - # Add noise to the model input according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) - - # Calculate the elements to repeat depending on the use of prior-preservation and custom captions.
- if not train_dataset.custom_instance_prompts: - elems_to_repeat_text_embeds = bsz // 2 if args.with_prior_preservation else bsz - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz - else: - elems_to_repeat_text_embeds = 1 - elems_to_repeat_time_ids = bsz // 2 if args.with_prior_preservation else bsz - - # Predict the noise residual - if not args.train_text_encoder: - unet_added_conditions = { - "time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1), - "text_embeds": unet_add_text_embeds.repeat(elems_to_repeat_text_embeds, 1), - } - prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) - model_pred = unet( - noisy_model_input, - timesteps, - prompt_embeds_input, - added_cond_kwargs=unet_added_conditions, - return_dict=False, - )[0] - else: - unet_added_conditions = {"time_ids": add_time_ids.repeat(elems_to_repeat_time_ids, 1)} - prompt_embeds, pooled_prompt_embeds = encode_prompt( - text_encoders=[text_encoder_one, text_encoder_two], - tokenizers=None, - prompt=None, - text_input_ids_list=[tokens_one, tokens_two], - ) - unet_added_conditions.update( - {"text_embeds": pooled_prompt_embeds.repeat(elems_to_repeat_text_embeds, 1)} - ) - prompt_embeds_input = prompt_embeds.repeat(elems_to_repeat_text_embeds, 1, 1) - model_pred = unet( - noisy_model_input, - timesteps, - prompt_embeds_input, - added_cond_kwargs=unet_added_conditions, - return_dict=False, - )[0] - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(model_input, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.with_prior_preservation: - # Chunk the noise and model_pred into two parts and compute the loss on each part separately. - model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) - target, target_prior = torch.chunk(target, 2, dim=0) - - # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") - - if args.snr_gamma is None: - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - else: - # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. - # Since we predict the noise instead of x_0, the original formulation is slightly changed. - # This is discussed in Section 4.2 of the same paper. - snr = compute_snr(noise_scheduler, timesteps) - base_weight = ( - torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr - ) - - if noise_scheduler.config.prediction_type == "v_prediction": - # Velocity objective needs to be floored to an SNR weight of one. - mse_loss_weights = base_weight + 1 - else: - # Epsilon and sample both use the same loss weights. - mse_loss_weights = base_weight - - loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") - loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights - loss = loss.mean() - - if args.with_prior_preservation: - # Add the prior loss to the instance loss. 
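For reference, the loss logic above (the Min-SNR-gamma weighting from https://arxiv.org/abs/2303.09556 plus the prior-preservation term that the very next line adds in) condensed into one hypothetical helper; `snr` would come from diffusers' `compute_snr` exactly as in the script, and the SNR weights are chunked alongside the predictions so shapes keep matching:

import torch
import torch.nn.functional as F

def weighted_diffusion_loss(model_pred, target, snr=None, snr_gamma=None, v_prediction=False,
                            prior_loss_weight=1.0, with_prior_preservation=False):
    prior_loss = 0.0
    if with_prior_preservation:
        # The batch stacks instance and class samples, so split the two halves first.
        model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
        target, target_prior = torch.chunk(target, 2, dim=0)
        if snr is not None:
            snr, _ = torch.chunk(snr, 2, dim=0)
        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
    if snr_gamma is None:
        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
    else:
        # min(SNR, gamma) / SNR caps the weight of low-noise timesteps; v-prediction is floored at +1.
        mse_loss_weights = torch.clamp(snr, max=snr_gamma) / snr
        if v_prediction:
            mse_loss_weights = mse_loss_weights + 1
        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
        loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
        loss = loss.mean()
    return loss + prior_loss_weight * prior_loss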
- loss = loss + args.prior_loss_weight * prior_loss - - accelerator.backward(loss) - htcore.mark_step() - if accelerator.sync_gradients: - params_to_clip = ( - itertools.chain(unet_lora_parameters, text_lora_parameters_one, text_lora_parameters_two) - if args.train_text_encoder - else unet_lora_parameters - ) - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - htcore.mark_step() - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - global_step += 1 - - if accelerator.is_main_process: - if global_step % args.checkpointing_steps == 0: - # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` - if args.checkpoints_total_limit is not None: - checkpoints = os.listdir(args.output_dir) - checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] - checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints - if len(checkpoints) >= args.checkpoints_total_limit: - num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 - removing_checkpoints = checkpoints[0:num_to_remove] - - logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" - ) - logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") - - for removing_checkpoint in removing_checkpoints: - removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) - shutil.rmtree(removing_checkpoint) - - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) - logger.info(f"Saved state to {save_path}") - - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - accelerator.log(logs, step=global_step) - - if global_step >= args.max_train_steps: - break - - if accelerator.is_main_process: - if args.validation_prompt is not None and epoch % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - # create pipeline - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - vae=vae, - text_encoder=accelerator.unwrap_model(text_encoder_one), - text_encoder_2=accelerator.unwrap_model(text_encoder_two), - unet=accelerator.unwrap_model(unet), - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - pipeline.text_encoder.eval() - pipeline.text_encoder_2.eval() - pipeline.unet.eval() - - # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it - scheduler_args = {} - - if "variance_type" in pipeline.scheduler.config: - variance_type = pipeline.scheduler.config.variance_type - - if variance_type in ["learned", "learned_range"]: - variance_type = "fixed_small" - - scheduler_args["variance_type"] = variance_type - - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config, **scheduler_args - ) - - pipeline = pipeline.to(accelerator.device) - pipeline.set_progress_bar_config(disable=True) - - # run inference - if args.seed is not None: - if accelerator.device == torch.device("hpu"): - # torch.Generator() is unsupported on HPU - generator = set_seed(args.seed) - else: - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) - else: - generator = None - pipeline_args = {"prompt": args.validation_prompt} - - images = [ - pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images) - ] - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "validation": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - pipeline.unet.train() - if args.train_text_encoder: - pipeline.text_encoder.train() - pipeline.text_encoder_2.train() - del pipeline - - # Save the lora layers - accelerator.wait_for_everyone() - if accelerator.is_main_process: - unet = unwrap_model(unet) - unet = unet.to(torch.float32) - unet_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet)) - - if args.train_text_encoder: - text_encoder_one = unwrap_model(text_encoder_one) - text_encoder_lora_layers = convert_state_dict_to_diffusers( - get_peft_model_state_dict(text_encoder_one.to(torch.float32)) - ) - text_encoder_two = unwrap_model(text_encoder_two) - text_encoder_2_lora_layers = convert_state_dict_to_diffusers( - get_peft_model_state_dict(text_encoder_two.to(torch.float32)) - ) - else: - text_encoder_lora_layers = None - text_encoder_2_lora_layers = None - - GaudiStableDiffusionXLPipeline.save_lora_weights( - save_directory=args.output_dir, - unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, - text_encoder_2_lora_layers=text_encoder_2_lora_layers, - ) - # Final inference - # Load previous pipeline - vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - ) - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - vae=vae, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - use_habana=True, - gaudi_config=gaudi_config, - ) - - # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it - scheduler_args = {} - - if "variance_type" in pipeline.scheduler.config: - variance_type = pipeline.scheduler.config.variance_type - - if variance_type in ["learned", "learned_range"]: - variance_type = "fixed_small" - - scheduler_args["variance_type"] = variance_type - - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) - - # load attention processors - pipeline.load_lora_weights(args.output_dir) - - # run inference - images = [] - if args.validation_prompt and args.num_validation_images > 0: - pipeline = pipeline.to(accelerator.device) - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "test": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - - if args.push_to_hub: - save_model_card( - repo_id, - images=images, - base_model=args.pretrained_model_name_or_path, - train_text_encoder=args.train_text_encoder, - instance_prompt=args.instance_prompt, - validation_prompt=args.validation_prompt, - repo_folder=args.output_dir, - vae_path=args.pretrained_vae_model_name_or_path, - ) - upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - ignore_patterns=["step_*", "epoch_*"], - ) - - accelerator.end_training() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py deleted file mode 100644 index 9e7503867f..0000000000 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ /dev/null @@ -1,1554 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning script for Stable Diffusion models for text2image. 
-Adapted from the following sources: -https://github.com/huggingface/diffusers/blob/v0.25.1/examples/text_to_image/train_text_to_image_sdxl.py -""" - -import argparse -import functools -import gc -import json -import logging -import math -import os -import random -import shutil -import time -from pathlib import Path - -import accelerate -import datasets -import diffusers -import habana_frameworks.torch as htorch -import habana_frameworks.torch.core as htcore -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import transformers -from accelerate.logging import get_logger -from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration -from datasets import load_dataset -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - UNet2DConditionModel, -) -from diffusers.optimization import get_scheduler -from diffusers.training_utils import EMAModel, compute_snr -from diffusers.utils import check_min_version, is_wandb_available -from diffusers.utils.torch_utils import is_compiled_module -from huggingface_hub import create_repo, upload_folder -from packaging import version -from torchvision import transforms -from torchvision.transforms.functional import crop -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType -from optimum.habana.diffusers import ( - GaudiEulerDiscreteScheduler, - GaudiStableDiffusionXLPipeline, -) -from optimum.habana.utils import HabanaProfile, set_seed, to_gb_rounded - - -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") - -logger = get_logger(__name__, log_level="INFO") - -DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), -} - - -def save_model_card( - repo_id: str, - images=None, - validation_prompt=None, - base_model=str, - dataset_name=str, - repo_folder=None, - vae_path=None, -): - img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -dataset: {dataset_name} -tags: -- stable-diffusion-xl -- stable-diffusion-xl-diffusers -- text-to-image -- diffusers -inference: true ---- - """ - model_card = f""" -# Text-to-image finetuning - {repo_id} - -This pipeline was finetuned from **{base_model}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompt: {validation_prompt}: \n -{img_str} - -Special VAE used for training: {vae_path}. 
-""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder" -): - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, subfolder=subfolder, revision=revision - ) - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "CLIPTextModelWithProjection": - from transformers import CLIPTextModelWithProjection - - return CLIPTextModelWithProjection - else: - raise ValueError(f"{model_class} is not supported.") - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--pretrained_vae_model_name_or_path", - type=str, - default=None, - help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--variant", - type=str, - default=None, - help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", - ) - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand." - ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--train_data_dir", - type=str, - default=None, - help=( - "A folder containing the training data. Folder contents must follow the structure described in" - " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" - " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), - ) - parser.add_argument( - "--image_column", - type=str, - default="image", - help="The column of the dataset containing an image.", - ) - parser.add_argument( - "--caption_column", - type=str, - default="text", - help="The column of the dataset containing a caption or a list of captions.", - ) - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - help="A prompt that is used during validation to verify that the model is learning.", - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=1, - help=( - "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`." 
- ), - ) - parser.add_argument( - "--max_train_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ), - ) - parser.add_argument( - "--proportion_empty_prompts", - type=float, - default=0, - help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).", - ) - parser.add_argument( - "--output_dir", - type=str, - default="sdxl-model-finetuned", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=1024, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution." - ), - ) - parser.add_argument( - "--crop_resolution", - type=int, - default=1024, - help=( - "The resolution for cropping input images, all the images in the train/validation dataset will be resized to this" - " resolution." - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--random_flip", - action="store_true", - help="whether to randomly flip images horizontally.", - ) - parser.add_argument( - "--train_batch_size", - type=int, - default=16, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" - " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" - " training using `--resume_from_checkpoint`." - ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=("Max number of checkpoints to store."), - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help=( - "Whether training should be resumed from a previous checkpoint. Use a path saved by" - ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' - ), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. 
Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", - type=int, - default=500, - help="Number of steps for the warmup in the lr scheduler.", - ) - parser.add_argument( - "--timestep_bias_strategy", - type=str, - default="none", - choices=["earlier", "later", "range", "none"], - help=( - "The timestep bias strategy, which may help direct the model toward learning low or high frequency details." - " Choices: ['earlier', 'later', 'range', 'none']." - " The default is 'none', which means no bias is applied, and training proceeds normally." - " The value of 'later' will increase the frequency of the model's final training timesteps." - ), - ) - parser.add_argument( - "--timestep_bias_multiplier", - type=float, - default=1.0, - help=( - "The multiplier for the bias. Defaults to 1.0, which means no bias is applied." - " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it." - ), - ) - parser.add_argument( - "--timestep_bias_begin", - type=int, - default=0, - help=( - "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias." - " Defaults to zero, which equates to having no specific bias." - ), - ) - parser.add_argument( - "--timestep_bias_end", - type=int, - default=1000, - help=( - "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias." - " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on." - ), - ) - parser.add_argument( - "--timestep_bias_portion", - type=float, - default=0.25, - help=( - "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased." - " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines" - " whether the biased portions are in the earlier or later timesteps." - ), - ) - parser.add_argument( - "--snr_gamma", - type=float, - default=None, - help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " - "More details here: https://arxiv.org/abs/2303.09556.", - ) - parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument( - "--adam_beta1", - type=float, - default=0.9, - help="The beta1 parameter for the Adam optimizer.", - ) - parser.add_argument( - "--adam_beta2", - type=float, - default=0.999, - help="The beta2 parameter for the Adam optimizer.", - ) - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument( - "--adam_epsilon", - type=float, - default=1e-08, - help="Epsilon value for the Adam optimizer", - ) - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the Hub.", - ) - parser.add_argument( - "--hub_token", - type=str, - default=None, - help="The token to use to push to the Model Hub.", - ) - parser.add_argument( - "--prediction_type", - type=str, - default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. 
If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", - ) - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--bf16", - action="store_true", - default=False, - help=("Whether to use bf16 mixed precision."), - ) - parser.add_argument( - "--local_rank", - type=int, - default=-1, - help="For distributed training: local_rank", - ) - parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") - parser.add_argument( - "--gaudi_config_name", - type=str, - default=None, - help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", - ) - parser.add_argument( - "--throughput_warmup_steps", - type=int, - default=0, - help=( - "Number of steps to ignore for throughput calculation. For example, with throughput_warmup_steps=N, the" - " first N steps will not be considered in the calculation of the throughput. This is especially useful in" - " lazy mode." - ), - ) - parser.add_argument( - "--use_hpu_graphs_for_training", - action="store_true", - help="Use HPU graphs for training on HPU.", - ) - parser.add_argument( - "--use_hpu_graphs_for_inference", - action="store_true", - help="Use HPU graphs for inference on HPU.", - ) - - parser.add_argument( - "--image_save_dir", - type=str, - default="./stable-diffusion-generated-images", - help="The directory where images will be saved.", - ) - parser.add_argument( - "--output_type", - type=str, - choices=["pil", "np"], - default="pil", - help="Whether to return PIL images or Numpy arrays.", - ) - parser.add_argument( - "--profiling_warmup_steps", - default=0, - type=int, - help="Number of steps to ignore for profiling.", - ) - parser.add_argument( - "--profiling_steps", - default=0, - type=int, - help="Number of steps to capture for profiling.", - ) - parser.add_argument( - "--logging_step", - default=1, - type=int, - help="Print the loss for every logging_step.", - ) - parser.add_argument( - "--mediapipe", - default="", - type=str, - help="Use gaudi2 HW mediapipe over regular dataloader. \ - case 1: nothing is passed to this argument -> regular torch dataloader is used\ - case 2: an empty or non existant path is passed -> images are dumped from dataset (passed in through dataset_name) in that location before first run \ - case 3: a non empty path is passed -> images from that location are used ", - ) - parser.add_argument( - "--adjust_throughput", - default=False, - action="store_true", - help="Checkpoint saving takes a lot of time. 
Ignore time for checkpoint saving for throughput calculations", - ) - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) - if env_local_rank != -1 and env_local_rank != args.local_rank: - args.local_rank = env_local_rank - - # Sanity checks - if args.dataset_name is None and args.train_data_dir is None: - raise ValueError("Need either a dataset name or a training folder.") - - if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: - raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") - - return args - - -# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt -def encode_prompt( - batch, - text_encoders, - tokenizers, - proportion_empty_prompts, - caption_column, - is_train=True, -): - prompt_embeds_list = [] - prompt_batch = batch[caption_column] - - captions = [] - for caption in prompt_batch: - if random.random() < proportion_empty_prompts: - captions.append("") - elif isinstance(caption, str): - captions.append(caption) - elif isinstance(caption, (list, np.ndarray)): - # take a random caption if there are multiple - captions.append(random.choice(caption) if is_train else caption[0]) - - with torch.no_grad(): - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - text_inputs = tokenizer( - captions, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - prompt_embeds = text_encoder( - text_input_ids.to(text_encoder.device), - output_hidden_states=True, - return_dict=False, - ) - - # We are only ALWAYS interested in the pooled output of the final text encoder - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-1][-2] - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) - prompt_embeds_list.append(prompt_embeds) - prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) - pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1) - # map creates cache in cpu so need to change tensor to float32 - return { - "prompt_embeds": prompt_embeds.to(torch.float32), - "pooled_prompt_embeds": pooled_prompt_embeds.to(torch.float32), - } - - -def compute_vae_encodings(pixel_values, vae): - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - pixel_values = pixel_values.to(vae.device, dtype=vae.dtype) - with torch.no_grad(): - model_input = vae.encode(pixel_values).latent_dist.sample() - model_input = model_input * vae.config.scaling_factor - return model_input - - -def generate_timestep_weights(args, num_timesteps): - weights = torch.ones(num_timesteps) - - # Determine the indices to bias - num_to_bias = int(args.timestep_bias_portion * num_timesteps) - - if args.timestep_bias_strategy == "later": - bias_indices = slice(-num_to_bias, None) - elif args.timestep_bias_strategy == "earlier": - bias_indices = slice(0, num_to_bias) - elif args.timestep_bias_strategy == "range": - # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500. - range_begin = args.timestep_bias_begin - range_end = args.timestep_bias_end - if range_begin < 0: - raise ValueError( - "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero." 
- ) - if range_end > num_timesteps: - raise ValueError( - "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps." - ) - bias_indices = slice(range_begin, range_end) - else: # 'none' or any other string - return weights - if args.timestep_bias_multiplier <= 0: - raise ValueError( - "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps." - " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead." - " A timestep bias multiplier less than or equal to 0 is not allowed." - ) - - # Apply the bias - weights[bias_indices] *= args.timestep_bias_multiplier - - # Normalize - weights /= weights.sum() - - return weights - - -def main(args): - logging_dir = Path(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) - - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - gaudi_config.use_torch_autocast = gaudi_config.use_torch_autocast or args.bf16 - accelerator = GaudiAccelerator( - gradient_accumulation_steps=args.gradient_accumulation_steps, - mixed_precision="bf16" if gaudi_config.use_torch_autocast else "no", - log_with=args.report_to, - project_config=accelerator_project_config, - force_autocast=gaudi_config.use_torch_autocast, - ) - - if args.report_to == "wandb": - if not is_wandb_available(): - raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - import wandb - - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_warning() - diffusers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - diffusers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now.
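Stepping back to `generate_timestep_weights` defined above: it returns a probability distribution over timesteps (the weights are normalized to sum to one). A plausible way to consume it, assumed here because the sampling site is outside this excerpt, is to replace uniform `torch.randint` sampling with multinomial sampling:

import torch

def sample_biased_timesteps(weights: torch.Tensor, bsz: int) -> torch.Tensor:
    # More indices are drawn from the up-weighted region of the noise schedule.
    return torch.multinomial(weights, bsz, replacement=True).long()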
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if args.push_to_hub: - repo_id = create_repo( - repo_id=args.hub_model_id or Path(args.output_dir).name, - exist_ok=True, - token=args.hub_token, - ).repo_id - - # Load the tokenizers - tokenizer_one = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, - ) - tokenizer_two = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer_2", - revision=args.revision, - use_fast=False, - ) - - # import correct text encoder classes - text_encoder_cls_one = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path, args.revision - ) - text_encoder_cls_two = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2" - ) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - - # Check for terminal SNR in combination with SNR Gamma - text_encoder_one = text_encoder_cls_one.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="text_encoder", - revision=args.revision, - variant=args.variant, - ).to(accelerator.device) - text_encoder_two = text_encoder_cls_two.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="text_encoder_2", - revision=args.revision, - variant=args.variant, - ).to(accelerator.device) - - # For mixed precision training we cast all non-trainable weigths to half-precision - # as these weights are only used for inference, keeping weights in full precision is not required. - weight_dtype = torch.float32 - if gaudi_config.use_torch_autocast: - weight_dtype = torch.bfloat16 - - vae_path = ( - args.pretrained_model_name_or_path - if args.pretrained_vae_model_name_or_path is None - else args.pretrained_vae_model_name_or_path - ) - - vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, - revision=args.revision, - variant=args.variant, - ) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - ) - - # Freeze vae and text encoders. - vae.requires_grad_(False) - text_encoder_one.requires_grad_(False) - text_encoder_two.requires_grad_(False) - # Set unet as trainable. 
- unet.train() - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes - ) - - # Initialize the optimizer - if gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_class = FusedAdamW - else: - optimizer_class = torch.optim.AdamW - - if gaudi_config.use_fused_clip_norm: - from habana_frameworks.torch.hpex.normalization import FusedClipNorm - - fused_clip_norm = FusedClipNorm(unet.parameters(), args.max_grad_norm) - - # Optimizer creation - params_to_optimize = unet.parameters() - optimizer = optimizer_class( - params_to_optimize, - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon, - ) - - # Get the datasets: you can either provide your own training and evaluation files (see below) - # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - if args.dataset_name is not None: - if len(args.mediapipe) > 0: - assert ( - args.resolution == args.crop_resolution - ), f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})" - if args.local_rank == 0: - if not os.path.exists(args.mediapipe): - os.mkdir(args.mediapipe) - if len(os.listdir(args.mediapipe)) == 0: - dataset = load_dataset(args.dataset_name, None) - with open(f"{args.mediapipe}/label.txt", "w") as f: - for idx, dt in enumerate(dataset["train"]): - dt["image"].save(f"{args.mediapipe}/{idx}.jpg") - f.write(dt["text"] + "\n") - if accelerator.distributed_type != GaudiDistributedType.NO: - torch.distributed.barrier() - - from media_pipe_imgdir import get_dataset_for_pipeline - - dt = get_dataset_for_pipeline(args.mediapipe) - dataset = {"train": dt} - else: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - ) - else: - data_files = {} - if args.train_data_dir is not None: - data_files["train"] = os.path.join(args.train_data_dir, "**") - dataset = load_dataset( - "imagefolder", - data_files=data_files, - cache_dir=args.cache_dir, - ) - # See more about loading custom images at - # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - column_names = dataset["train"].column_names - - # 6. Get the column names for input/target. 
- dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) - if args.image_column is None: - image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - image_column = args.image_column - if image_column not in column_names: - raise ValueError( - f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" - ) - if args.caption_column is None: - caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - caption_column = args.caption_column - if caption_column not in column_names: - raise ValueError( - f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" - ) - - text_encoder_one = text_encoder_one.to(accelerator.device, dtype=weight_dtype) - text_encoder_two = text_encoder_two.to(accelerator.device, dtype=weight_dtype) - # Preprocessing the datasets. - train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR) - train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution) - train_flip = transforms.RandomHorizontalFlip(p=1.0) - train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]) - - vae = vae.to(accelerator.device, dtype=weight_dtype) - # Let's first compute all the embeddings so that we can free up the text encoders - # from memory. We will pre-compute the VAE encodings too. - text_encoders = [text_encoder_one, text_encoder_two] - tokenizers = [tokenizer_one, tokenizer_two] - - def preprocess_train(examples): - images = [image.convert("RGB") for image in examples[image_column]] - # image aug - original_sizes = [] - all_images = [] - crop_top_lefts = [] - for image in images: - original_sizes.append((image.height, image.width)) - image = train_resize(image) - if args.crop_resolution < args.resolution: - if args.center_crop: - y1 = max(0, int(round((image.height - args.resolution) / 2.0))) - x1 = max(0, int(round((image.width - args.resolution) / 2.0))) - image = train_crop(image) - else: - y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) - image = crop(image, y1, x1, h, w) - else: - x1 = 0 - y1 = 0 - if args.random_flip and random.random() < 0.5: - # flip - x1 = image.width - x1 - image = train_flip(image) - crop_top_left = (y1, x1) - crop_top_lefts.append(crop_top_left) - image = train_transforms(image) - all_images.append(image) - - examples["original_sizes"] = original_sizes - examples["crop_top_lefts"] = crop_top_lefts - examples["pixel_values"] = all_images - return examples - - with accelerator.main_process_first(): - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) - train_dataset = dataset["train"] - if len(args.mediapipe) == 0: - # Set the training transforms - train_dataset = train_dataset.with_transform(preprocess_train) - - compute_embeddings_fn = functools.partial( - encode_prompt, - text_encoders=text_encoders, - tokenizers=tokenizers, - proportion_empty_prompts=args.proportion_empty_prompts, - caption_column=args.caption_column, - ) - - # TODO : adding crop = (0,0) for now. 
- # If we do random crop, we have to do this in mediapipe - def attach_metadata(batch): - import imagesize - - return { - "original_sizes": imagesize.get(batch["image"]), - "crop_top_lefts": (0, 0), - } - - with accelerator.main_process_first(): - from datasets.fingerprint import Hasher - - # fingerprint used by the cache for the other processes to load the result - # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401 - new_fingerprint = Hasher.hash(args) - train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint) - if len(args.mediapipe) > 0: - train_dataset = train_dataset.map(attach_metadata, load_from_cache_file=False) - - def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"].clone().detach() for example in examples]) - original_sizes = [example["original_sizes"] for example in examples] - crop_top_lefts = [example["crop_top_lefts"] for example in examples] - prompt_embeds = torch.stack([torch.tensor(example["prompt_embeds"]) for example in examples]) - pooled_prompt_embeds = torch.stack([torch.tensor(example["pooled_prompt_embeds"]) for example in examples]) - - return { - "pixel_values": pixel_values, - "prompt_embeds": prompt_embeds, - "pooled_prompt_embeds": pooled_prompt_embeds, - "original_sizes": original_sizes, - "crop_top_lefts": crop_top_lefts, - } - - # DataLoaders creation: - train_dataloader = torch.utils.data.DataLoader( - train_dataset, - shuffle=True, - collate_fn=collate_fn, - batch_size=args.train_batch_size, - num_workers=args.dataloader_num_workers, - ) - - del text_encoders, tokenizers - gc.collect() - # Create EMA for the unet. - if args.use_ema: - ema_unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - variant=args.variant, - ) - ema_unet = EMAModel( - ema_unet.parameters(), - model_cls=UNet2DConditionModel, - model_config=ema_unet.config, - ) - - # `accelerate` 0.16.0 will have better support for customized saving - if version.parse(accelerate.__version__) >= version.parse("0.16.0"): - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format - def save_model_hook(models, weights, output_dir): - if accelerator.is_main_process: - if args.use_ema: - ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) - - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) - - # make sure to pop weight so that corresponding model is not saved again - weights.pop() - - def load_model_hook(models, input_dir): - if args.use_ema: - load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel) - ema_unet.load_state_dict(load_model.state_dict()) - ema_unet.to(accelerator.device) - del load_model - - for i in range(len(models)): - # pop models so that they are not loaded again - model = models.pop() - - # load diffusers style into model - load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet") - model.register_to_config(**load_model.config) - - model.load_state_dict(load_model.state_dict()) - del load_model - - accelerator.register_save_state_pre_hook(save_model_hook) - accelerator.register_load_state_pre_hook(load_model_hook) - - # Scheduler and math around the number of training steps. 
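A small worked example (all sizes invented) of the scheduler/step math that follows; the script re-evaluates the same two lines later in case the dataloader is swapped for the MediaPipe one.

```python
import math

# Hypothetical sizes, only to make the bookkeeping below concrete.
num_batches_per_epoch = 1000          # len(train_dataloader)
gradient_accumulation_steps = 4
num_train_epochs = 3

num_update_steps_per_epoch = math.ceil(num_batches_per_epoch / gradient_accumulation_steps)  # 250
max_train_steps = num_train_epochs * num_update_steps_per_epoch                              # 750
```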
- overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - ) - - unet = unet.to("hpu") - # Prepare everything with our `accelerator`. - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, optimizer, train_dataloader, lr_scheduler - ) - - if len(args.mediapipe) > 0: - dataloader_params = { - "batch_size": args.train_batch_size, - "resolution": args.resolution, - } - from media_pipe_imgdir import MediaApiDataLoader - - train_dataloader = MediaApiDataLoader(train_dataset, **dataloader_params) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. - if accelerator.is_main_process: - accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args)) - - def unwrap_model(model, training=False): - model = accelerator.unwrap_model(model) - model = model._orig_mod if is_compiled_module(model) else model - if not training: - return model - else: - if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU: - kwargs = {} - kwargs["gradient_as_bucket_view"] = True - accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) - if args.use_hpu_graphs_for_training: - htcore.hpu.ModuleCacher()(model=model, inplace=True) - - unwrap_model(model=unet, training=True) - hb_profiler = HabanaProfile( - warmup=args.profiling_warmup_steps, - active=args.profiling_steps, - record_shapes=False, - ) - # Train! - total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - global_step = 0 - first_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = os.listdir(args.output_dir) - dirs = [d for d in dirs if d.startswith("checkpoint")] - dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) - path = dirs[-1] if len(dirs) > 0 else None - - if path is None: - accelerator.print( - f"Checkpoint '{args.resume_from_checkpoint}' does not exist. 
Starting a new training run." - ) - args.resume_from_checkpoint = None - initial_global_step = 0 - else: - accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) - global_step = int(path.split("-")[1]) - - initial_global_step = global_step - first_epoch = global_step // num_update_steps_per_epoch - - else: - initial_global_step = 0 - - progress_bar = tqdm( - range(0, args.max_train_steps), - initial=initial_global_step, - desc="Steps", - # Only show the progress bar once on each machine. - disable=not accelerator.is_local_main_process, - ) - - t0 = None - t_start = time.perf_counter() - train_loss = torch.tensor(0, dtype=torch.float, device="hpu") - checkpoint_time = 0 - for epoch in range(first_epoch, args.num_train_epochs): - train_loss.zero_() - if hb_profiler: - hb_profiler.start() - for step, batch in enumerate(train_dataloader): - if t0 is None and global_step == args.throughput_warmup_steps: - t0 = time.perf_counter() - with accelerator.accumulate(unet): - # Move compute_vae_encoding here to reflect the transformed image input - model_input = compute_vae_encodings(batch["pixel_values"], vae) - # Sample noise that we'll add to the latents - noise = torch.randn_like(model_input) - if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - rand_device = model_input.device - noise += args.noise_offset * torch.randn( - (model_input.shape[0], model_input.shape[1], 1, 1), - device=rand_device, - ) - noise = noise.to(model_input.device) - - bsz = model_input.shape[0] - - if args.timestep_bias_strategy == "none": - # Sample a random timestep for each image without bias. - timesteps = torch.randint( - 0, - noise_scheduler.config.num_train_timesteps, - (bsz,), - device=model_input.device, - ) - timesteps = timesteps.long() - else: - # Sample a random timestep for each image, potentially biased by the timestep weights. - # Biasing the timestep weights allows us to spend less time training irrelevant timesteps. 
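`generate_timestep_weights` is defined earlier in the script (outside this excerpt); the hedged sketch below only illustrates how its output is consumed. The weights act as an unnormalized categorical distribution over timesteps from which `torch.multinomial` draws biased samples; the bias values here are invented.

```python
import torch

# Toy bias, not the script's actual weighting scheme.
num_train_timesteps = 1000
bsz = 4
weights = torch.ones(num_train_timesteps)
weights[num_train_timesteps // 2 :] *= 2.0   # favour the noisier half of the schedule

# torch.multinomial accepts unnormalized non-negative weights.
timesteps = torch.multinomial(weights, bsz, replacement=True).long()   # one timestep per image
```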
- weights = generate_timestep_weights(args, noise_scheduler.config.num_train_timesteps).to( - model_input.device - ) - timesteps = torch.multinomial(weights, bsz, replacement=True).long() - - # Add noise to the model input according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) - - # time ids - def compute_time_ids(original_size, crops_coords_top_left): - # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids - target_size = (args.resolution, args.resolution) - if "torch.Tensor" in str(type(original_size)): - add_time_ids = torch.cat( - [ - original_size, - crops_coords_top_left, - torch.tensor(target_size, device=crops_coords_top_left.device), - ] - ) - else: - add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_time_ids = torch.tensor([add_time_ids]) - add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) - return add_time_ids - - add_time_ids = torch.cat( - [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])] - ) - # Predict the noise residual - unet_added_conditions = {"time_ids": add_time_ids} - prompt_embeds = batch["prompt_embeds"].to(accelerator.device) - pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device) - unet_added_conditions.update({"text_embeds": pooled_prompt_embeds}) - - model_pred = unet( - noisy_model_input, - timesteps, - prompt_embeds, - added_cond_kwargs=unet_added_conditions, - return_dict=False, - )[0] - - # Get the target for loss depending on the prediction type - if args.prediction_type is not None: - # set prediction_type of scheduler if defined - noise_scheduler.register_to_config(prediction_type=args.prediction_type) - - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(model_input, noise, timesteps) - elif noise_scheduler.config.prediction_type == "sample": - # We set the target to latents here, but the model_pred will return the noise sample prediction. - target = model_input - # We will have to subtract the noise residual from the prediction to get the target sample. - model_pred = model_pred - noise - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.snr_gamma is None: - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - else: - # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. - # Since we predict the noise instead of x_0, the original formulation is slightly changed. - # This is discussed in Section 4.2 of the same paper. - snr = compute_snr(noise_scheduler, timesteps) - if noise_scheduler.config.prediction_type == "v_prediction": - # Velocity objective requires that we add one to SNR values before we divide by them. - snr = snr + 1 - mse_loss_weights = ( - torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr - ) - - loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") - loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights - loss = loss.mean() - - # Gather the losses across all processes for logging (if we use distributed training). 
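The min-SNR-gamma weighting above reduces to `min(snr, gamma) / snr` per timestep (with `snr + 1` substituted first for v-prediction). A small numeric illustration with made-up SNR values:

```python
import torch

snr = torch.tensor([0.5, 1.0, 5.0, 25.0])   # per-timestep SNR values (invented)
snr_gamma = 5.0

mse_loss_weights = torch.minimum(snr, torch.full_like(snr, snr_gamma)) / snr
print(mse_loss_weights)                      # tensor([1.0000, 1.0000, 1.0000, 0.2000])
# Very noisy (low-SNR) timesteps keep full weight; easy high-SNR timesteps are down-weighted.
```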
- avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() - train_loss += avg_loss / args.gradient_accumulation_steps - # Backpropagate - # TODO: check why this cause bufferoverflow issue - # with torch.autocast(device_type="hpu", dtype=weight_dtype, enabled=True): - accelerator.backward(loss) - htcore.mark_step() - - if accelerator.sync_gradients: - params_to_clip = unet.parameters() - if gaudi_config.use_fused_clip_norm: - fused_clip_norm.clip_norm(params_to_clip) - else: - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - htcore.mark_step() - hb_profiler.step() - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - global_step += 1 - - if accelerator.is_main_process: - if args.checkpointing_steps is not None and global_step % args.checkpointing_steps == 0: - t_chkpt_start = time.perf_counter() - # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` - if args.checkpoints_total_limit is not None: - checkpoints = os.listdir(args.output_dir) - checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] - checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints - if len(checkpoints) >= args.checkpoints_total_limit: - num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 - removing_checkpoints = checkpoints[0:num_to_remove] - - logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" - ) - logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") - - for removing_checkpoint in removing_checkpoints: - removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) - shutil.rmtree(removing_checkpoint) - - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) - logger.info(f"Saved state to {save_path}") - t_chkpt_end = time.perf_counter() - checkpoint_time += t_chkpt_end - t_chkpt_start - - if (global_step - 1) % args.logging_step == 0 or global_step == args.max_train_steps: - train_loss_scalar = train_loss.item() - accelerator.log({"train_loss": train_loss_scalar}, step=global_step) - - if args.gradient_accumulation_steps > 1: - logs = { - "step_loss": loss.item(), - "lr": lr_scheduler.get_last_lr()[0], - "mem_used": to_gb_rounded(htorch.hpu.memory_allocated()), - } - else: - logs = { - "step_loss": train_loss_scalar, - "lr": lr_scheduler.get_last_lr()[0], - "mem_used": to_gb_rounded(htorch.hpu.memory_allocated()), - } - progress_bar.set_postfix(**logs) - train_loss.zero_() - - if global_step >= args.max_train_steps: - break - - hb_profiler.stop() - if accelerator.is_main_process: - if args.validation_prompt is not None and (epoch + 1) % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - if args.use_ema: - # Store the UNet parameters temporarily and load the EMA parameters to perform inference. 
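For reference, the `--checkpoints_total_limit` pruning a few lines above in isolation (directory names invented): before writing a new checkpoint the script keeps at most `limit - 1` existing ones, so the total on disk never exceeds the limit.

```python
# Invented directory names, mirroring the pruning logic above.
checkpoints = ["checkpoint-100", "checkpoint-200", "checkpoint-300", "checkpoint-400"]
checkpoints_total_limit = 3

if len(checkpoints) >= checkpoints_total_limit:
    num_to_remove = len(checkpoints) - checkpoints_total_limit + 1   # 4 - 3 + 1 = 2
    removing_checkpoints = checkpoints[:num_to_remove]               # the two oldest
    kept = checkpoints[num_to_remove:]
# After saving "checkpoint-500" there are again exactly 3 checkpoints on disk.
```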
- ema_unet.store(unet.parameters()) - ema_unet.copy_to(unet.parameters()) - - # create pipeline - vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder=("vae" if args.pretrained_vae_model_name_or_path is None else None), - revision=args.revision, - variant=args.variant, - ) - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - vae=vae, - unet=unwrap_model(unet), - revision=args.revision, - variant=args.variant, - use_habana=True, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - gaudi_config=args.gaudi_config_name, - ) - if args.prediction_type is not None: - scheduler_args = {"prediction_type": args.prediction_type} - pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args) - pipeline.scheduler = GaudiEulerDiscreteScheduler.from_config(pipeline.scheduler.config) - pipeline = pipeline.to(accelerator.device) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = torch.Generator(device="cpu").manual_seed(args.seed) if args.seed else None - pipeline_args = {"prompt": args.validation_prompt} - - with torch.autocast( - device_type="hpu", - dtype=torch.bfloat16, - enabled=gaudi_config.use_torch_autocast, - ): - images = [ - pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0] - for _ in range(args.num_validation_images) - ] - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "validation": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - - del pipeline - - if t0 is not None: - duration = time.perf_counter() - t0 - (checkpoint_time if args.adjust_throughput else 0) - ttt = time.perf_counter() - t_start - throughput = (args.max_train_steps - args.throughput_warmup_steps) * total_batch_size / duration - - accelerator.wait_for_everyone() - if accelerator.is_main_process: - logger.info(f"Throughput = {throughput} samples/s") - logger.info(f"Train runtime = {duration} seconds") - logger.info(f"Total Train runtime = {ttt} seconds") - metrics = { - "train_samples_per_second": throughput, - "train_runtime": duration, - } - with open(f"{args.output_dir}/speed_metrics.json", mode="w") as file: - json.dump(metrics, file) - - unet = accelerator.unwrap_model(unet) - if args.use_ema: - ema_unet.copy_to(unet.parameters()) - - # Serialize pipeline. 
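Before the final pipeline serialization below, a worked example of the throughput numbers logged just above; all values are hypothetical. The reported throughput excludes the warm-up steps and, when `--adjust_throughput` is set, the time spent writing checkpoints.

```python
# All values invented, just to show the formula used for the reported metrics.
max_train_steps = 750
throughput_warmup_steps = 3
total_batch_size = 256     # train_batch_size * num_processes * gradient_accumulation_steps
duration = 1500.0          # seconds, already minus checkpoint_time if --adjust_throughput is set

throughput = (max_train_steps - throughput_warmup_steps) * total_batch_size / duration
print(f"Throughput = {throughput:.1f} samples/s")   # Throughput = 127.5 samples/s
```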
- vae = AutoencoderKL.from_pretrained( - vae_path, - subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - ) - pipeline = GaudiStableDiffusionXLPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unet, - vae=vae, - revision=args.revision, - variant=args.variant, - torch_dtype=weight_dtype, - scheduler=noise_scheduler, - use_habana=True, - use_hpu_graphs=args.use_hpu_graphs_for_inference, - gaudi_config=args.gaudi_config_name, - ) - if args.prediction_type is not None: - scheduler_args = {"prediction_type": args.prediction_type} - pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args) - pipeline.save_pretrained(args.output_dir) - - # run inference - images = [] - if args.validation_prompt and args.num_validation_images > 0: - pipeline = pipeline.to(accelerator.device) - generator = torch.Generator(device="cpu").manual_seed(args.seed) if args.seed else None - with torch.autocast( - device_type="hpu", - dtype=weight_dtype, - enabled=gaudi_config.use_torch_autocast, - ): - images = [ - pipeline( - args.validation_prompt, - num_inference_steps=25, - generator=generator, - ).images[0] - for _ in range(args.num_validation_images) - ] - # Save images in the specified directory if not None and if they are in PIL format - if args.image_save_dir is not None: - if args.output_type == "pil": - image_save_dir = Path(args.image_save_dir) - image_save_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Saving images in {image_save_dir.resolve()}...") - for i, image in enumerate(images): - image.save(image_save_dir / f"image_{epoch}_{i+1}.png") - else: - logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "test": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - - if args.push_to_hub: - save_model_card( - repo_id=repo_id, - images=images, - validation_prompt=args.validation_prompt, - base_model=args.pretrained_model_name_or_path, - dataset_name=args.dataset_name, - repo_folder=args.output_dir, - vae_path=args.pretrained_vae_model_name_or_path, - ) - upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - ignore_patterns=["step_*", "epoch_*"], - ) - - accelerator.end_training() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/summarization/README.md b/examples/summarization/README.md deleted file mode 100644 index e1fc98c4dd..0000000000 --- a/examples/summarization/README.md +++ /dev/null @@ -1,259 +0,0 @@ - - -# Summarization Examples - -This directory contains examples for finetuning and evaluating transformers on summarization tasks. - -`run_summarization.py` is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune and evaluate T5 (or predict using BART) on it. - -For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files. -You will also find examples of these below. 
- -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-card Training - -Here is an example of a summarization task with T5: - -```bash -python run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -Only T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "summarize: "`. - -We used CNN/DailyMail dataset in this example as `t5-small` was trained on it and one can get good scores even when pre-training with a very small sample. - -Extreme Summarization (XSum) Dataset is another commonly used dataset for the task of summarization. To use it replace `--dataset_name cnn_dailymail --dataset_config "3.0.0"` with `--dataset_name xsum`. - -And here is how you would use it on your own files, after adjusting the values for the arguments -`--train_file`, `--validation_file`, `--text_column` and `--summary_column` to match your setup: - -```bash -python run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --overwrite_output_dir \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -The task of summarization also supports custom CSV and JSONLINES formats. - -### Custom CSV Files - -If it's a csv file the training and validation files should have a column for the inputs texts and a column for the summaries. - -If the csv file has just two columns as in the following example: - -```csv -text,summary -"I'm sitting here in a boring room. It's just another rainy Sunday afternoon. I'm wasting my time I got nothing to do. I'm hanging around I'm waiting for you. But nothing ever happens. And I wonder","I'm sitting in a room where I'm waiting for something to happen" -"I see trees so green, red roses too. I see them bloom for me and you. And I think to myself what a wonderful world. I see skies so blue and clouds so white. The bright blessed day, the dark sacred night. And I think to myself what a wonderful world.","I'm a gardener and I'm a big fan of flowers." -"Christmas time is here. Happiness and cheer. Fun for all that children call. Their favorite time of the year. Snowflakes in the air. Carols everywhere. Olden times and ancient rhymes. Of love and dreams to share","It's that time of year again." -``` - -The first column is assumed to be for `text` and the second is for the summary. 
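If you are unsure which columns the script will pick up from a custom CSV, a quick check (the file path below is a placeholder) is to load it with 🤗 Datasets and print the column names:

```python
from datasets import load_dataset

# "train.csv" is a placeholder path for a file formatted like the example above.
ds = load_dataset("csv", data_files={"train": "train.csv"})
print(ds["train"].column_names)   # e.g. ['text', 'summary']
```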
- -If the csv file has multiple columns, you can then specify the names of the columns to use: - -```bash - --text_column text_column_name \ - --summary_column summary_column_name \ -``` - -For example, if the columns were: - -```csv -id,date,text,summary -``` - -and you wanted to select only `text` and `summary`, then you'd pass these additional arguments: - -```bash - --text_column text \ - --summary_column summary \ -``` - -### Custom JSONLINES Files - -The second supported format is jsonlines. Here is an example of a jsonlines custom data file. - - -```json -{"text": "I'm sitting here in a boring room. It's just another rainy Sunday afternoon. I'm wasting my time I got nothing to do. I'm hanging around I'm waiting for you. But nothing ever happens. And I wonder", "summary": "I'm sitting in a room where I'm waiting for something to happen"} -{"text": "I see trees so green, red roses too. I see them bloom for me and you. And I think to myself what a wonderful world. I see skies so blue and clouds so white. The bright blessed day, the dark sacred night. And I think to myself what a wonderful world.", "summary": "I'm a gardener and I'm a big fan of flowers."} -{"text": "Christmas time is here. Happiness and cheer. Fun for all that children call. Their favorite time of the year. Snowflakes in the air. Carols everywhere. Olden times and ancient rhymes. Of love and dreams to share", "summary": "It's that time of year again."} -``` - -Same as with the CSV files, by default the first value will be used as the text record and the second as the summary record. Therefore you can use any key names for the entries, in this example `text` and `summary` were used. - -And as with the CSV files, you can specify which values to select from the file, by explicitly specifying the corresponding key names. 
In our example this again would be: - -```bash - --text_column text \ - --summary_column summary \ -``` - - -## Multi-card Training - -Here is an example on 8 HPUs: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_summarization.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --source_prefix '"summarize: "' \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -## Using DeepSpeed - -Here is an example on 8 HPUs on Gaudi2 with DeepSpeed-ZeRO3 to fine-tune [FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl): -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=512 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_summarization.py \ - --model_name_or_path google/flan-t5-xxl \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --source_prefix '"summarize: "' \ - --output_dir ./tst-summarization \ - --per_device_train_batch_size 22 \ - --per_device_eval_batch_size 22 \ - --learning_rate 1e-4 \ - --num_train_epochs 3 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --generation_max_length 129 \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --gradient_checkpointing \ - --adam_epsilon 1e-08 --logging_steps 1 \ - --deepspeed ds_flan_t5_z3_config_bf16.json -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... 
- -For instance, you can run inference with T5 on the CNN-DailyMail dataset on 1 Gaudi card with the following command: -```bash -python run_summarization.py \ - --model_name_or_path t5-small \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --bf16 \ - --bf16_full_eval -``` - -You can run inference with BART on the CNN-DailyMail dataset on 1 Gaudi card with the following command: -```bash -python run_summarization.py \ - --model_name_or_path facebook/bart-large-cnn \ - --do_predict \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_eval_batch_size 2 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/bart \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --num_beams 1 -``` diff --git a/examples/summarization/ds_flan_t5_z3_config_bf16.json b/examples/summarization/ds_flan_t5_z3_config_bf16.json deleted file mode 100755 index 538f0afa5e..0000000000 --- a/examples/summarization/ds_flan_t5_z3_config_bf16.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "bf16": { - "enabled": true - }, - "optimizer": { - "type": "adam", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto", - "torch_adam": "torch_impl", - "adam_w_mode": false - } - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e9, - "reduce_bucket_size": 1666777, - "reduce_scatter" : false, - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_16bit_weights_on_model_save": true - }, - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/examples/summarization/requirements.txt b/examples/summarization/requirements.txt deleted file mode 100644 index 8c09feabf7..0000000000 --- a/examples/summarization/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -datasets >= 2.4.0, <= 2.19.2 -sentencepiece != 0.1.92 -protobuf == 3.20.3 -rouge-score == 0.1.2 -nltk == 3.8.1 -py7zr == 0.21.0 -torch >= 1.3, <= 2.4.0 -evaluate == 0.4.2 diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py deleted file mode 100755 index 48bb39cdfa..0000000000 --- a/examples/summarization/run_summarization.py +++ /dev/null @@ -1,870 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for sequence to sequence. -""" -# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. - -import copy -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -import evaluate -import nltk # Here to have a nice missing dependency error message early on -import numpy as np -import torch -import transformers -from datasets import load_dataset -from filelock import FileLock -from transformers import ( - AutoConfig, - AutoModelForSeq2SeqLM, - AutoTokenizer, - DataCollatorForSeq2Seq, - HfArgumentParser, - MBart50Tokenizer, - MBart50TokenizerFast, - MBartTokenizer, - MBartTokenizerFast, - default_data_collator, -) -from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") - -try: - nltk.data.find("tokenizers/punkt") -except (LookupError, OSError): - if is_offline_mode(): - raise LookupError( - "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files" - ) - with FileLock(".lock") as lock: - nltk.download("punkt", quiet=True) - -# A list of all multilingual tokenizer which require lang attribute. -MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast] - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - resize_position_embeddings: Optional[bool] = field( - default=None, - metadata={ - "help": ( - "Whether to automatically resize the position embeddings if `max_source_length` exceeds " - "the model's position embeddings." - ) - }, - ) - use_cache: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not the model should return the last key/values attentions (not used by all models)." - "Only relevant if `config.is_decoder=True`." - ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."}) - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - text_column: Optional[str] = field( - default=None, - metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."}, - ) - summary_column: Optional[str] = field( - default=None, - metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."}, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": ( - "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)." - ) - }, - ) - test_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)." 
- }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_source_length: Optional[int] = field( - default=1024, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - max_target_length: Optional[int] = field( - default=128, - metadata={ - "help": ( - "The maximum total sequence length for target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - val_max_target_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " - "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " - "during ``evaluate`` and ``predict``." - ) - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for HPU in lazy mode." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - ) - }, - ) - num_beams: Optional[int] = field( - default=1, - metadata={ - "help": ( - "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " - "which is used during ``evaluate`` and ``predict``." - ) - }, - ) - ignore_pad_token_for_loss: bool = field( - default=True, - metadata={ - "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." - }, - ) - source_prefix: Optional[str] = field( - default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} - ) - source_suffix: Optional[str] = field(default="", metadata={"help": "A suffix to add after every source text."}) - - forced_bos_token: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The token to force as the first generated token after the decoder_start_token_id. 
" - "Useful for multilingual models like mBART where the first generated token" - "needs to be the target language token (Usually it is the target language token)" - ) - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training, validation, or test file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - if self.val_max_target_length is None: - self.val_max_target_length = self.max_target_length - - -summarization_name_mapping = { - "amazon_reviews_multi": ("review_body", "review_title"), - "big_patent": ("description", "abstract"), - "cnn_dailymail": ("article", "highlights"), - "orange_sum": ("text", "summary"), - "pn_summary": ("article", "summary"), - "psc": ("extract_text", "summary_text"), - "samsum": ("dialogue", "summary"), - "thaisum": ("body", "summary"), - "xglue": ("news_body", "news_title"), - "xsum": ("document", "summary"), - "wiki_summary": ("article", "highlights"), - "multi_news": ("document", "summary"), -} - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_summarization", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - if data_args.source_prefix is None and model_args.model_name_or_path in [ - "google-t5/t5-small", - "google-t5/t5-base", - "google-t5/t5-large", - "google-t5/t5-3b", - "google-t5/t5-11b", - ]: - logger.warning( - "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " - "`--source_prefix 'summarize: ' `" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files this script will use the first column for the full texts and the second column for the - # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, - _attn_implementation=training_args.attn_implementation, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - is_bart = model.config.model_type == "bart" - if is_bart and training_args.do_train: - raise ValueError( - "Training is not yet supported for BART. Eval or predict can be enabled with `--do_eval` and `--do_predict`." - ) - - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. 
- embeddings = model.get_input_embeddings() - if is_deepspeed_zero3_enabled(): - import deepspeed - - with deepspeed.zero.GatheredParameters(embeddings.weight, modifier_rank=None): - embedding_size = embeddings.weight.shape[0] - else: - embedding_size = embeddings.weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): - if isinstance(tokenizer, MBartTokenizer): - model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] - else: - model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.lang) - - if model.config.decoder_start_token_id is None: - raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") - - if ( - hasattr(model.config, "max_position_embeddings") - and model.config.max_position_embeddings < data_args.max_source_length - ): - if model_args.resize_position_embeddings is None: - logger.warning( - "Increasing the model's number of position embedding vectors from" - f" {model.config.max_position_embeddings} to {data_args.max_source_length}." - ) - model.resize_position_embeddings(data_args.max_source_length) - elif model_args.resize_position_embeddings: - model.resize_position_embeddings(data_args.max_source_length) - else: - raise ValueError( - f"`--max_source_length` is set to {data_args.max_source_length}, but the model only has" - f" {model.config.max_position_embeddings} position encodings. Consider either reducing" - f" `--max_source_length` to {model.config.max_position_embeddings} or to automatically resize the" - " model's position encodings by passing `--resize_position_embeddings`." - ) - - prefix = data_args.source_prefix if data_args.source_prefix is not None else "" - suffix = data_args.source_suffix if data_args.source_suffix is not None else "" - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - column_names = raw_datasets["validation"].column_names - elif training_args.do_predict: - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - column_names = raw_datasets["test"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") - return - - if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): - assert ( - data_args.lang is not None - ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" - - tokenizer.src_lang = data_args.lang - tokenizer.tgt_lang = data_args.lang - - # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token - # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. - forced_bos_token_id = ( - tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None - ) - model.config.forced_bos_token_id = forced_bos_token_id - - # Get the column names for input/target. 
- dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) - if data_args.text_column is None: - text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - text_column = data_args.text_column - if text_column not in column_names: - raise ValueError( - f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" - ) - if data_args.summary_column is None: - summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - summary_column = data_args.summary_column - if summary_column not in column_names: - raise ValueError( - f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" - ) - - # Temporarily set max_target_length for training. - max_target_length = data_args.max_target_length - padding = "max_length" if data_args.pad_to_max_length else False - - if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): - logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " - f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" - ) - - def preprocess_function(examples): - # remove pairs where at least one record is None - - inputs, targets = [], [] - for i in range(len(examples[text_column])): - if examples[text_column][i] and examples[summary_column][i]: - inputs.append(examples[text_column][i]) - targets.append(examples[summary_column][i]) - else: - raise ValueError("Found case where either text or summary is missing.") - - inputs = [prefix + inp + suffix for inp in inputs] - model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - - # Tokenize targets with the `text_target` keyword argument - labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. 
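A toy illustration (token ids invented) of the `-100` masking described in the comment above; `-100` is the index that PyTorch's cross-entropy loss ignores by default.

```python
# Invented ids: assume the pad token id is 0.
pad_token_id = 0
label_ids = [[37, 82, 11, 0, 0], [5, 9, 0, 0, 0]]

masked = [[(tok if tok != pad_token_id else -100) for tok in seq] for seq in label_ids]
# [[37, 82, 11, -100, -100], [5, 9, -100, -100, -100]] -> padded positions do not contribute to the loss
```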
- if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - def preprocess_bucketing_function(examples): - # remove pairs where at least one record is None - - inputs, targets = [], [] - for i in range(len(examples[text_column])): - if examples[text_column][i] and examples[summary_column][i]: - inputs.append(examples[text_column][i]) - targets.append(examples[summary_column][i]) - else: - raise ValueError("Found case where either text or summary is missing.") - - inputs = [prefix + inp + suffix for inp in inputs] - model_inputs = tokenizer(inputs, return_tensors="pt", padding=True) - new_model_inputs = {"input_ids": []} - for i in range(len(model_inputs["input_ids"])): - cur_len = model_inputs["input_ids"][i].shape[-1] - max_length = (cur_len + 128 - 1) // 128 * 128 - if max_length > data_args.max_source_length: - max_length = data_args.max_source_length - new_model_inputs["input_ids"].append(model_inputs["input_ids"][i][:max_length]) - else: - new_model_inputs["input_ids"].append( - torch.nn.functional.pad( - model_inputs["input_ids"][i], (0, max_length - cur_len), value=tokenizer.pad_token_id - ) - ) - model_inputs = new_model_inputs - # Tokenize targets with the `text_target` keyword argument - labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. - if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - if training_args.do_train: - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - with training_args.main_process_first(desc="train dataset map pre-processing"): - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - - def wrapper_preprocess_function(examples): - if model.config.is_encoder_decoder: - return preprocess_bucketing_function(examples) - else: - return preprocess_function(examples) - - if training_args.do_eval: - max_target_length = data_args.val_max_target_length - eval_dataset = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - with training_args.main_process_first(desc="validation dataset map pre-processing"): - eval_dataset = eval_dataset.map( - wrapper_preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - - if training_args.do_predict: - max_target_length = data_args.val_max_target_length - predict_dataset = raw_datasets["test"] - if data_args.max_predict_samples is not None: - 
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) - with training_args.main_process_first(desc="prediction dataset map pre-processing"): - predict_dataset = predict_dataset.map( - wrapper_preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) - - # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id - if data_args.pad_to_max_length: - data_collator = default_data_collator - else: - data_collator = DataCollatorForSeq2Seq( - tokenizer, - model=model, - label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if training_args.fp16 else None, - ) - - # Metric - metric = evaluate.load("rouge", cache_dir=model_args.cache_dir) - - def postprocess_text(preds, labels): - preds = [pred.strip() for pred in preds] - labels = [label.strip() for label in labels] - - # rougeLSum expects newline after each sentence - preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] - labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] - - return preds, labels - - def compute_metrics(eval_preds): - preds, labels = eval_preds - if isinstance(preds, tuple): - preds = preds[0] - # Replace -100s used for padding as we can't decode them - preds = np.where(preds != -100, preds, tokenizer.pad_token_id) - decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - - # Some simple post-processing - decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - - result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) - result = {k: round(v * 100, 4) for k, v in result.items()} - prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] - result["gen_len"] = np.mean(prediction_lens) - return result - - # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_config = copy.deepcopy(model.generation_config) - if training_args.generation_max_length is not None: - training_args.generation_config.max_length = training_args.generation_max_length - else: - training_args.generation_config.max_length = data_args.val_max_target_length - if data_args.num_beams is not None: - if data_args.num_beams == 1: - training_args.generation_config.length_penalty = None - training_args.generation_config.early_stopping = False - training_args.generation_config.num_beams = data_args.num_beams - elif training_args.generation_num_beams is not None: - training_args.generation_config.num_beams = training_args.generation_num_beams - - # Initialize our Trainer - trainer = GaudiSeq2SeqTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = 
last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - results = {} - if training_args.do_eval: - logger.info("*** Evaluate ***") - if isinstance(eval_dataset, dict): - metrics = {} - for eval_ds_name, eval_ds in eval_dataset.items(): - dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}") - metrics.update(dataset_metrics) - else: - metrics = trainer.evaluate(metric_key_prefix="eval") - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.do_predict: - logger.info("*** Predict ***") - - predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict") - metrics = predict_results.metrics - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - if trainer.is_world_process_zero(): - if training_args.predict_with_generate: - predictions = predict_results.predictions - predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) - predictions = tokenizer.batch_decode( - predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - predictions = [pred.strip() for pred in predictions] - output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") - with open(output_prediction_file, "w") as writer: - writer.write("\n".join(predictions)) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if data_args.lang is not None: - kwargs["language"] = data_args.lang - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - return results - - -if __name__ == "__main__": - main() diff --git a/examples/text-classification/README.md b/examples/text-classification/README.md deleted file mode 100644 index f5af6bc7d3..0000000000 --- a/examples/text-classification/README.md +++ /dev/null @@ -1,219 +0,0 @@ - - -# Text Classification Examples - -## GLUE tasks - -Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py). - -Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding -Evaluation](https://gluebenchmark.com/). 
This script can fine-tune any of the models on the [hub](https://huggingface.co/models) -and can also be used for a dataset hosted on our [hub](https://huggingface.co/datasets) or your own data in a csv or a JSON file -(the script might need some tweaks in that case, refer to the comments inside for help). - -GLUE is made up of a total of 9 different tasks where the task name can be cola, sst2, mrpc, stsb, qqp, mnli, qnli, rte or wnli. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Fine-tuning BERT on MRPC - -For the following cases, an example of a Gaudi configuration file is given -[here](https://github.com/huggingface/optimum-habana#how-to-use-it). - - -### Single-card Training - -The following example fine-tunes BERT Large (lazy mode) on the `mrpc` dataset hosted on our [hub](https://huggingface.co/datasets): - -```bash -python run_glue.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --task_name mrpc \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --learning_rate 3e-5 \ - --num_train_epochs 3 \ - --max_seq_length 128 \ - --output_dir ./output/mrpc/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -### Multi-card Training - -Here is how you would fine-tune the BERT large model (with whole word masking) on the text classification MRPC task using the `run_glue` script, with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_glue.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --task_name mrpc \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 3 \ - --max_seq_length 128 \ - --output_dir /tmp/mrpc_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - -### Using DeepSpeed - -Similarly to multi-card training, here is how you would fine-tune the BERT large model (with whole word masking) on the text classification MRPC task using DeepSpeed with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_glue.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --task_name mrpc \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 3 \ - --max_seq_length 128 \ - --output_dir /tmp/mrpc_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. 
-Here is a DeepSpeed configuration you can use to train your models on Gaudi:
-```json
-{
-    "steps_per_print": 64,
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "gradient_accumulation_steps": "auto",
-    "bf16": {
-        "enabled": true
-    },
-    "gradient_clipping": 1.0,
-    "zero_optimization": {
-        "stage": 2,
-        "overlap_comm": false,
-        "reduce_scatter": false,
-        "contiguous_gradients": false
-    }
-}
-```
-
-> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
-
-
-## Inference
-
-To run inference only, you can start from the commands above and simply remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size` and `--num_train_epochs`.
-
-For instance, you can run inference with BERT on GLUE on 1 Gaudi card with the following command:
-```bash
-python run_glue.py \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
-    --task_name mrpc \
-    --do_eval \
-    --max_seq_length 128 \
-    --output_dir ./output/mrpc/ \
-    --use_habana \
-    --use_lazy_mode \
-    --use_hpu_graphs_for_inference \
-    --bf16
-```
-
-## Llama Guard on MRPC
-
-Llama Guard can be used for text classification. The Transformers library will change the head of the model for you during fine-tuning or inference. You can use the same general command as for BERT, except that you need to add `--add_pad_token=True` because Llama-based models do not define a `pad_token` in their model and tokenizer configuration files. With this flag, a `pad_token` equal to the `eos_token` is added to the tokenizer and model configurations if it is not already defined.
-
-### Fine-tuning with DeepSpeed
-
-Llama Guard can be fine-tuned with DeepSpeed. Here is how you would do it on the text classification MRPC task with 8 HPUs:
-
-```bash
-python ../gaudi_spawn.py \
-    --world_size 8 --use_deepspeed run_glue.py \
-    --model_name_or_path meta-llama/LlamaGuard-7b \
-    --gaudi_config Habana/llama \
-    --task_name mrpc \
-    --do_train \
-    --do_eval \
-    --per_device_train_batch_size 32 \
-    --per_device_eval_batch_size 8 \
-    --learning_rate 3e-5 \
-    --num_train_epochs 3 \
-    --max_seq_length 128 \
-    --add_pad_token True \
-    --output_dir /tmp/mrpc_output/ \
-    --use_habana \
-    --use_lazy_mode \
-    --use_hpu_graphs_for_inference \
-    --throughput_warmup_steps 3 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json
-```
-
-You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
-
-> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
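-
-For reference, here is a minimal sketch of what `--add_pad_token=True` does under the hood. It mirrors the corresponding logic in `run_glue.py`; the checkpoint name is just the example used above, and the snippet is an illustration only, not something you need to run separately:
-
-```python
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-
-# Example checkpoint from the command above (gated model, requires access)
-model_name = "meta-llama/LlamaGuard-7b"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-
-# If neither the model config nor the tokenizer defines a padding token,
-# reuse the EOS token as the padding token, which is what --add_pad_token=True does.
-if not model.config.pad_token_id and not tokenizer.pad_token:
-    tokenizer.pad_token = tokenizer.eos_token
-    model.config.pad_token_id = tokenizer.eos_token_id
-```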
- -### Inference - -You can run inference with Llama Guard on GLUE on 1 Gaudi card with the following command: - -```bash -python run_glue.py \ - --model_name_or_path meta-llama/LlamaGuard-7b \ - --gaudi_config Habana/llama \ - --task_name mrpc \ - --do_eval \ - --per_device_eval_batch_size 64 \ - --max_seq_length 128 \ - --add_pad_token True \ - --pad_to_max_length True \ - --output_dir ./output/mrpc/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 2 \ - --bf16 -``` diff --git a/examples/text-classification/requirements.txt b/examples/text-classification/requirements.txt deleted file mode 100644 index 8acefd7006..0000000000 --- a/examples/text-classification/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 2.4.0, <= 2.19.2 -sentencepiece != 0.1.92 -scipy == 1.13.1 -scikit-learn == 1.5.0 -protobuf == 3.20.3 -torch >= 1.3, <= 2.4.0 -evaluate == 0.4.2 diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py deleted file mode 100755 index 155d1dd650..0000000000 --- a/examples/text-classification/run_glue.py +++ /dev/null @@ -1,663 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Finetuning the library models for sequence classification on GLUE.""" -# You can also adapt this script on your own text classification task. Pointers for this are left as comments. - -import logging -import os -import random -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -import evaluate -import numpy as np -import transformers -from datasets import load_dataset -from transformers import ( - AutoConfig, - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PretrainedConfig, - default_data_collator, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - task_name: Optional[str] = field( - default=None, - metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, - ) - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." 
- ) - }, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "A csv or a json file containing the training data."} - ) - validation_file: Optional[str] = field( - default=None, metadata={"help": "A csv or a json file containing the validation data."} - ) - problem_type: Optional[str] = field( - default="single_label_classification", - metadata={"help": "Problem type, such as single_label_classification or multi_label_classification"}, - ) - test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) - - def __post_init__(self): - if self.task_name is not None: - self.task_name = self.task_name.lower() - if self.task_name not in task_to_keys.keys(): - raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) - elif self.dataset_name is not None: - pass - elif self.train_file is None or self.validation_file is None: - raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") - else: - train_extension = self.train_file.split(".")[-1] - assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." - validation_extension = self.validation_file.split(".")[-1] - assert ( - validation_extension == train_extension - ), "`validation_file` should have the same extension (csv or json) as `train_file`." - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - ignore_mismatched_sizes: bool = field( - default=False, - metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, - ) - add_pad_token: bool = field( - default=False, - metadata={"help": "Will add `pad_token` to tokenizer and model's config as `eos_token` if it's not defined."}, - ) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. 
- # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_glue", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the - # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named - # label if at least two columns are provided. - # - # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this - # single column. 
You can easily tweak this behavior (see below) - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.task_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - "nyu-mll/glue", - data_args.task_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - elif data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - # Loading a dataset from your local files. - # CSV/JSON training and evaluation files are needed. - data_files = {"train": data_args.train_file, "validation": data_args.validation_file} - - # Get the test dataset: you can provide your own CSV/JSON test file (see below) - # when you use `do_predict` without specifying a GLUE benchmark task. - if training_args.do_predict: - if data_args.test_file is not None: - train_extension = data_args.train_file.split(".")[-1] - test_extension = data_args.test_file.split(".")[-1] - assert ( - test_extension == train_extension - ), "`test_file` should have the same extension (csv or json) as `train_file`." - data_files["test"] = data_args.test_file - else: - raise ValueError("Need either a GLUE task or a test file for `do_predict`.") - - for key in data_files.keys(): - logger.info(f"load a local file for {key}: {data_files[key]}") - - if data_args.train_file.endswith(".csv"): - # Loading a dataset from local csv files - raw_datasets = load_dataset( - "csv", - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - # Loading a dataset from local json files - raw_datasets = load_dataset( - "json", - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets. - - # Labels - if data_args.task_name is not None: - is_regression = data_args.task_name == "stsb" - if not is_regression: - label_list = raw_datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - else: - # Trying to have good defaults here, don't hesitate to tweak to your needs. - is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] - if is_regression: - num_labels = 1 - else: - # A useful fast method: - # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique - label_list = raw_datasets["train"].unique("label") - label_list.sort() # Let's sort it for determinism - num_labels = len(label_list) - - # Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - problem_type=data_args.problem_type, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForSequenceClassification.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, - ) - - # Preprocessing the raw_datasets - if data_args.task_name is not None: - sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - else: - # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. - non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] - if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: - sentence1_key, sentence2_key = "sentence1", "sentence2" - else: - if len(non_label_column_names) >= 2: - sentence1_key, sentence2_key = non_label_column_names[:2] - else: - sentence1_key, sentence2_key = non_label_column_names[0], None - - # Padding strategy - if data_args.pad_to_max_length: - padding = "max_length" - else: - # We will pad later, dynamically at batch creation, to the max sequence length in each batch - padding = False - - if model_args.add_pad_token: - if not model.config.pad_token_id and not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = tokenizer.eos_token_id - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if ( - model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id - and data_args.task_name is not None - and not is_regression - ): - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} - if sorted(label_name_to_id.keys()) == sorted(label_list): - label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", - f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}." 
- "\nIgnoring the model labels as a result.", - ) - elif data_args.task_name is None and not is_regression: - label_to_id = {v: i for i, v in enumerate(label_list)} - - if label_to_id is not None: - model.config.label2id = label_to_id - model.config.id2label = {id: label for label, id in config.label2id.items()} - elif data_args.task_name is not None and not is_regression: - model.config.label2id = {l: i for i, l in enumerate(label_list)} - model.config.id2label = {id: label for label, id in config.label2id.items()} - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) - - # Map labels to IDs (not necessary for GLUE tasks) - if label_to_id is not None and "label" in examples: - result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] - return result - - with training_args.main_process_first(desc="dataset map pre-processing"): - raw_datasets = raw_datasets.map( - preprocess_function, - batched=True, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if training_args.do_eval: - if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: - if "test" not in raw_datasets and "test_matched" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] - if data_args.max_predict_samples is not None: - max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) - - # Log a few random samples from the training set: - if training_args.do_train: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - - # Get the metric function - if data_args.task_name is not None: - metric = evaluate.load("glue", data_args.task_name, cache_dir=model_args.cache_dir) - elif is_regression: - metric = evaluate.load("mse", cache_dir=model_args.cache_dir) - else: - metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) - - # You can define your custom compute_metrics function. 
It takes an `EvalPrediction` object (a namedtuple with a - # predictions and label_ids field) and has to return a dictionary string to float. - def compute_metrics(p: EvalPrediction): - preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=p.label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - - # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if - # we already did the padding. - if data_args.pad_to_max_length: - data_collator = default_data_collator - elif training_args.fp16: - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) - else: - data_collator = None - - # Initialize our Trainer - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - compute_metrics=compute_metrics, - tokenizer=tokenizer, - data_collator=data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.save_model() # Saves the tokenizer too for easy upload - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [data_args.task_name] - eval_datasets = [eval_dataset] - if data_args.task_name == "mnli": - tasks.append("mnli-mm") - valid_mm_dataset = raw_datasets["validation_mismatched"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(valid_mm_dataset), data_args.max_eval_samples) - valid_mm_dataset = valid_mm_dataset.select(range(max_eval_samples)) - eval_datasets.append(valid_mm_dataset) - combined = {} - - for eval_dataset, task in zip(eval_datasets, tasks): - metrics = trainer.evaluate(eval_dataset=eval_dataset) - - max_eval_samples = ( - data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - ) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - if task == "mnli-mm": - metrics = {k + "_mm": v for k, v in metrics.items()} - if task is not None and "mnli" in task: - combined.update(metrics) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) - - if training_args.do_predict: - logger.info("*** Predict ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [data_args.task_name] - predict_datasets = [predict_dataset] - if data_args.task_name == "mnli": - tasks.append("mnli-mm") - predict_datasets.append(raw_datasets["test_mismatched"]) - - for predict_dataset, task in zip(predict_datasets, tasks): - # Removing the `label` columns because it contains -1 and Trainer 
won't like that. - predict_dataset = predict_dataset.remove_columns("label") - predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions - predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) - - output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") - if trainer.is_world_process_zero(): - with open(output_predict_file, "w") as writer: - logger.info(f"***** Predict results {task} *****") - writer.write("index\tprediction\n") - for index, item in enumerate(predictions): - if is_regression: - writer.write(f"{index}\t{item:3.3f}\n") - else: - item = label_list[item] - writer.write(f"{index}\t{item}\n") - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} - if data_args.task_name is not None: - kwargs["language"] = "en" - kwargs["dataset_tags"] = "glue" - kwargs["dataset_args"] = data_args.task_name - kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md deleted file mode 100755 index 6597484f06..0000000000 --- a/examples/text-generation/README.md +++ /dev/null @@ -1,620 +0,0 @@ - - -# Language generation - -Conditional text generation on Intel® Gaudi® AI Accelerators. You can find more information about it in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom). - - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows: -```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -``` - - -## Usage - -In this section, we present how to benchmark a model on Intel Gaudi AI Accelerators with this script. We also show how to use it to run generation on any dataset from the [Hugging Face Hub](https://huggingface.co/datasets). 
- -To run generation with DeepSpeed-inference, you must launch the script as follows: - -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size number_of_devices run_generation.py ARGS -``` - -To run multiple DeepSpeed tasks simultaneously, you can launch them with different `master_port` and [`HABANA_VISIBLE_MODULES`](https://docs.habana.ai/en/latest/PyTorch/PT_Multiple_Tenants_on_HPU/Multiple_Dockers_each_with_Single_Workload.html#running-distributed-workload-inside-the-docker-container), for example: - -```bash -# the following tasks could run simultaneously in a container with 8 HPUs -HABANA_VISIBLE_MODULES="0,1" python ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py ARGS # using the default master_port=29500 -HABANA_VISIBLE_MODULES="2,3,4,5" python ../gaudi_spawn.py --use_deepspeed --world_size 4 --master_port 29501 run_generation.py ARGS -HABANA_VISIBLE_MODULES="6,7" python ../gaudi_spawn.py --use_deepspeed --world_size 2 --master_port 29502 run_generation.py ARGS -``` - -Without DeepSpeed-inference, you can run the script with: - -```bash -python run_generation.py ARGS -``` - -The list of all possible arguments can be obtained running: -```bash -python run_generation.py --help -``` - - -### Single and multiple prompts - -If you want to generate a sequence of text from a prompt of your choice, you should use the `--prompt` argument. -For example: -``` -python run_generation.py \ ---model_name_or_path gpt2 \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample \ ---prompt "Here is my prompt" -``` - -If you want to provide several prompts as inputs, here is how to do it: -``` -python run_generation.py \ ---model_name_or_path gpt2 \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample \ ---batch_size 2 \ ---prompt "Hello world" "How are you?" -``` - -> The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size. - -### Run Speculative Sampling on Gaudi - -If you want to generate a sequence of text from a prompt of your choice using assisted decoding, you can use the following command as an example: - -``` -python run_generation.py \ ---model_name_or_path gpt2 \ ---assistant_model distilgpt2 \ ---batch_size 1 \ ---max_new_tokens 100 \ ---use_hpu_graphs \ ---use_kv_cache \ ---num_return_sequences 1 \ ---temperature 0 \ ---prompt "Alice and Bob" -``` - -### Benchmark - -The default behaviour of this script (i.e. if no dataset is specified with `--dataset_name`) is to benchmark the given model with a few pre-defined prompts or with the prompt you gave with `--prompt`. 
-Here are a few settings you may be interested in: -- `--max_new_tokens` to specify the number of tokens to generate -- `--max_input_tokens` to specify the max input tokens to pad and truncate input sequences -- `--batch_size` to specify the batch size -- `--bf16` to run generation in bfloat16 precision (or to be specified in your DeepSpeed configuration if using DeepSpeed) -- `--use_hpu_graphs` to use [HPU graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) to speed up generation -- `--limit_hpu_graphs` to skip HPU Graph usage for first token to save memory -- `--use_kv_cache` to use the [key/value cache](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.use_cache) to speed up generation -- `--do_sample` or `--num_beams` to generate new tokens doing sampling or beam search (greedy search is the default) -- `--prompt` to benchmark the model on one or several prompts of your choice -- `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it -- `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it - -For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command: -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path bigscience/bloom \ ---batch_size 1 \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 -``` - -You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command: -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---max_new_tokens 4096 \ ---bf16 \ ---use_hpu_graphs \ ---use_kv_cache \ ---batch_size 52 \ ---attn_softmax_bf16 \ ---limit_hpu_graphs \ ---reuse_cache \ ---trim_logits -``` - -To run Falcon-7B inference, use the following command: -```bash -python run_generation.py \ - --model_name_or_path tiiuae/falcon-7b \ - --bf16 \ - --use_hpu_graphs \ - --use_kv_cache \ - --batch_size 1 \ - --max_new_tokens 128 \ - --do_sample -``` - -To run Falcon-40B inference on 8 Gaudi2 cards, use the following command: -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path tiiuae/falcon-40b \ ---max_new_tokens 2048 \ ---bf16 \ ---use_hpu_graphs \ ---use_kv_cache \ ---batch_size 1 \ ---do_sample \ ---use_flash_attention \ ---flash_attention_causal_mask -``` - -> To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should: -> - have a HF account -> - agree to the terms of use of the model in its model card on the HF Hub -> - set a read token as explained [here](https://huggingface.co/docs/hub/security-tokens) -> - login to your account using the HF CLI: run `huggingface-cli login` before launching your script -> -> And then you can run it as any other model: -> ``` -> python run_generation.py \ -> --model_name_or_path bigcode/starcoder \ -> --batch_size 1 \ -> --use_hpu_graphs \ -> --use_kv_cache \ -> --max_new_tokens 100 \ -> --bf16 -> ``` - -### Use any dataset from the Hugging Face Hub - -You can also provide the name of a dataset from the Hugging Face Hub to perform generation on it with the argument `--dataset_name`. 
-
-By default, the first column in the dataset of type `string` will be used as prompts. You can also select the column you want with the argument `--column_name`.
-
-Here is an example with [JulesBelveze/tldr_news](https://huggingface.co/datasets/JulesBelveze/tldr_news):
-```bash
-python run_generation.py \
---model_name_or_path gpt2 \
---batch_size 2 \
---max_new_tokens 100 \
---use_hpu_graphs \
---use_kv_cache \
---dataset_name JulesBelveze/tldr_news \
---column_name content \
---bf16
-```
-
-> The prompt length is limited to 16 tokens. Prompts longer than this will be truncated.
-
-
-### Use PEFT models for generation
-
-You can also provide the path to a PEFT model to perform generation with the argument `--peft_model`.
-
-For example:
-```bash
-python run_generation.py \
---model_name_or_path meta-llama/Llama-2-7b-hf \
---use_hpu_graphs \
---use_kv_cache \
---batch_size 1 \
---bf16 \
---max_new_tokens 100 \
---prompt "Here is my prompt" \
---peft_model goliaro/llama-2-7b-lora-full
-```
-
-
-### Using growing bucket optimization
-
-With `--bucket_size`, instead of padding the KV cache up to its full size before starting, we grow the cache/input in multiples of `bucket_size`. This helps increase throughput and also reduces the number of compilations if the dataset has varying prompt lengths.
-
-> For now, it is available only for greedy and beam search generation, and cannot be used with `--reuse_cache`.
-
-Here is an example:
-```bash
-python run_generation.py \
---model_name_or_path path_to_model \
---use_hpu_graphs \
---use_kv_cache \
---bf16 \
---max_new_tokens 200 \
---batch_size=2 \
---bucket_size 50
-```
-
-The `--bucket_size` option is especially useful when processing an input stream with varying lengths, that is when you have something like `--dataset_name squad --column_name context --max_input_tokens -1`. `--max_input_tokens -1` specifies no truncation of the input prompts in the dataset.
-
-Another way to simulate dynamic inputs is to use `--simulate_dyn_prompt`. For example, `--simulate_dyn_prompt 25 35 45` will extend or crop the default prompt (or the prompt passed in using `--prompt`) to sizes 25, 35, and 45, and throughput will be measured for these 3 lengths. If `--simulate_dyn_prompt` is used, the min and max input lengths from it are computed to perform warmup as well. One final optimization that can be used in case of dynamic inputs is `--reduce_recompile`. Thus, the suggested configuration to simulate dynamic inputs after warmup is to use all three arguments: `--simulate_dyn_prompt 25 35 45 --reduce_recompile --bucket_size 30`.
-
-While `--bucket_size` works for any model without model file changes, an even more optimized version of bucketing is supported for certain models like Llama. It can be enabled by setting the `--bucket_internal` flag (along with `--bucket_size` to specify the bucket size).
-
-
-### Running with torch.compile
-
-torch.compile is an experimental feature. It has not been validated for all models. To enable torch.compile, please
-set the following environment variables before running the command: `PT_ENABLE_INT64_SUPPORT=1` and `PT_HPU_LAZY_MODE=0`.
-
-You will also need to add `--torch_compile` to your command.
-
-### Running with tensor-parallel strategy
-
-> [!NOTE]
-> This strategy includes code from the [foundation-model-stack](https://github.com/foundation-model-stack/foundation-model-stack) repository, which is licensed under the Apache License 2.0. See the `LICENSE` file for more details.
- -> [!WARNING] -> torch.compile with tensor parallel strategy is an experimental feature. It has not been validated for all models. - -To enable torch.compile with tensor parallel strategy, please set the following environment variables before running the -command: `PT_ENABLE_INT64_SUPPORT=1` and `PT_HPU_LAZY_MODE=0`. This will enable tensor parallel strategy without deepspeed. - -You will also need to add `--torch_compile` and `--parallel_strategy="tp"` in your command. - -Here is an example: -```bash -PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py --world_size 8 run_generation.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---trim_logits \ ---use_kv_cache \ ---attn_softmax_bf16 \ ---bf16 \ ---bucket_internal \ ---bucket_size=128 \ ---use_flash_attention \ ---flash_attention_recompute \ ---batch_size 246 \ ---max_input_tokens 2048 \ ---max_new_tokens 2048 \ ---torch_compile \ ---parallel_strategy="tp" -``` - -### Running with FP8 - -Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. - -More information on enabling fp8 in SynapseAI is available here: -https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html - -Here is an example to measure the tensor quantization statistics on LLama2-70b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py \ ---use_deepspeed --world_size 8 run_lm_eval.py \ --o acc_70b_bs1_measure.txt \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---attn_softmax_bf16 \ ---use_hpu_graphs \ ---trim_logits \ ---use_kv_cache \ ---bucket_size=128 \ ---bucket_internal \ ---use_flash_attention \ ---flash_attention_recompute \ ---bf16 \ ---batch_size 1 -``` - -Here is an example to quantize the model based on previous measurements for LLama2-70b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ ---use_deepspeed --world_size 8 run_lm_eval.py \ --o acc_70b_bs1_quant.txt \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---attn_softmax_bf16 \ ---use_hpu_graphs \ ---trim_logits \ ---use_kv_cache \ ---bucket_size=128 \ ---bucket_internal \ ---use_flash_attention \ ---flash_attention_recompute \ ---bf16 \ ---batch_size 1 -``` - -Alternatively, here is another example to quantize the model based on previous measurements for LLama2-70b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ ---use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---attn_softmax_bf16 \ ---use_hpu_graphs \ ---trim_logits \ ---use_kv_cache \ ---reuse_cache \ ---use_flash_attention \ ---flash_attention_recompute \ ---bf16 \ ---batch_size 350 \ ---max_new_tokens 2048 \ ---max_input_tokens 2048 \ ---limit_hpu_graphs -``` - -Here is an example to measure the tensor quantization statistics on Mixtral-8x7B with 1 card: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py \ ---model_name_or_path mistralai/Mixtral-8x7B-v0.1 \ ---use_hpu_graphs \ ---use_kv_cache \ ---limit_hpu_graphs \ ---bucket_size 128 \ ---max_new_tokens 128 \ ---batch_size 1 \ ---bf16 -``` - -Here is an example to quantize the model based on previous measurements for Mixtral-8x7B with 1 card: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generation.py \ 
---model_name_or_path mistralai/Mixtral-8x7B-v0.1 \ ---use_hpu_graphs \ ---use_kv_cache \ ---limit_hpu_graphs \ ---bucket_size 128 \ ---max_new_tokens 2048 \ ---batch_size 16 \ ---bf16 -``` - -Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards: -> Please note that Falcon-180B is a gated model, and users are required to request access to it. Please refer to the instructions provided in the StarCoder example above. -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \ ---use_deepspeed --world_size 8 run_lm_eval.py \ --o acc_falcon180b_bs1_quant.txt \ ---model_name_or_path tiiuae/falcon-180B \ ---use_hpu_graphs \ ---use_kv_cache \ ---trim_logits \ ---batch_size 1 \ ---bf16 \ ---reuse_cache \ ---use_flash_attention \ ---flash_attention_recompute \ ---flash_attention_causal_mask -``` - -Here is an example to quantize the model based on previous measurements for Falcon-180B with 8 cards: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ ---use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path tiiuae/falcon-180B \ ---use_hpu_graphs \ ---use_kv_cache \ ---limit_hpu_graphs \ ---max_input_tokens 128 \ ---max_new_tokens 2048 \ ---batch_size 110 \ ---bf16 \ ---reuse_cache \ ---trim_logits \ ---use_flash_attention \ ---flash_attention_recompute \ ---flash_attention_causal_mask -``` - -Here is an example to measure the tensor quantization statistics on phi-2 with 1 card: - -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_lm_eval.py \ --o acc_phi-2_bs1_measure.txt \ ---model_name_or_path microsoft/phi-2 \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---batch_size 1 \ ---trim_logits \ ---reuse_cache \ ---bf16 -``` - -Here is an example to quantize the model based on previous measurements for phi-2 with 1 card: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_phi.json python run_generation.py \ ---model_name_or_path microsoft/phi-2 \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---batch_size 1 \ ---bf16 \ ---trim_logits \ ---reuse_cache -``` - - -### Running FP8 models on single device - -Some BF16 models do not fit on one card due to HPU memory limitations, but they do fit in FP8 precision. -Since measurements are calculated in BF16 precision, you should use the `unify_measurements` script to be able to run an FP8 model on a single card. -Here are the steps: -1. Measure the model on a number of cards that is enough for the model to fit in BF16. -2. Quantize the model on the same number of cards so that the scales are saved. -3. Run the unify_measurements.py script using the measurement files created in steps 1 and 2. A unified measurement is then calculated. -```bash -python quantization_tools/unify_measurements.py -g 01234567 -m *path_to_8x_measurements* -o *path_to_output_1x_measurement* -``` -In the above example, the measurements of cards 0-7 will be unified into a single measurement. If you specify `-g 0123 4567` instead, cards 0-3 and cards 4-7 will be unified into two different measurement files. All group combinations are supported. -4. Run quantization using the unified measurement file(s). - -More information on the usage of the unifier script can be found in the Habana FP8 docs: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html - - - -### CPU memory reduction on single card - -Some models can fit in HPU DRAM but cannot fit in CPU RAM.
-When running a model on a single card without DeepSpeed, the `--disk_offload` flag allows offloading weights to disk during model quantization in HQT. When this flag is set, each weight is first loaded from disk to CPU RAM, then brought to HPU DRAM and quantized there. This way, only one weight at a time resides in CPU RAM instead of the whole model. -To enable this weight offload mechanism, add the `--disk_offload` flag to the topology command line. -Here is an example of using disk_offload in a quantization command. -Please follow the "Running FP8 models on single device" section above before running the command below. - -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant.json TQDM_DISABLE=1 \ -python run_generation.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---attn_softmax_bf16 \ ---use_hpu_graphs \ ---trim_logits \ ---use_kv_cache \ ---limit_hpu_graphs \ ---bucket_size=128 \ ---bucket_internal \ ---max_new_tokens 2048 \ ---max_input_tokens 2048 \ ---bf16 \ ---batch_size 1 \ ---disk_offload \ ---use_flash_attention \ ---flash_attention_recompute -``` - -### Using Habana Flash Attention - -Habana Flash Attention addresses large sequence lengths during the prompt stage of inference. Using a causal attention mask during the prompt stage requires all input sequences in a batch to have the same length, but it provides a memory saving and thus enables higher batch sizes. - -The example below uses the `flash_attention_recompute` mode to reduce memory consumption during the prompt stage. Additionally, since all sequences in a batch have the same length, it uses `flash_attention_causal_mask`, which further improves performance by taking advantage of the lower-diagonal shape of the inputs to the softmax operation. - -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---use_hpu_graphs \ ---limit_hpu_graphs \ ---use_kv_cache \ ---bf16 \ ---trim_logits \ ---attn_softmax_bf16 \ ---bucket_size=128 \ ---bucket_internal \ ---batch_size 10 \ ---max_input_tokens 40960 \ ---max_new_tokens 5120 \ ---use_flash_attention \ ---flash_attention_recompute \ ---flash_attention_causal_mask \ ---book_source -``` - -For more details, see the [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).
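To make the causal-mask argument concrete, here is a small, self-contained PyTorch sketch of the attention pattern that `--flash_attention_causal_mask` exploits: because every sequence in the batch is padded to the same length, one lower-triangular mask can be broadcast over the whole batch, and roughly half of the softmax inputs are masked out. This is only a conceptual illustration with made-up tensor sizes; the actual fused SDPA kernel used on Gaudi is implemented inside the Habana software stack.

```python
# Conceptual sketch only -- not the fused Habana kernel.
import torch

batch, seq_len, dim = 2, 6, 8                  # toy sizes; all sequences share seq_len
q = torch.randn(batch, seq_len, dim)
k = torch.randn(batch, seq_len, dim)
v = torch.randn(batch, seq_len, dim)

scores = q @ k.transpose(-1, -2) / dim**0.5    # (batch, seq_len, seq_len)
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
scores = scores.masked_fill(~causal, float("-inf"))  # keep only the lower-diagonal part
attn_out = torch.softmax(scores, dim=-1) @ v   # (batch, seq_len, dim)
print(attn_out.shape)
```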
- -### Running with UINT4 weight quantization using AutoGPTQ - - -Llama2-7b in UINT4 weight-only quantization is enabled using the [AutoGPTQ fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch. -Currently, only UINT4 inference of pre-quantized models is supported. - -You can run a *UINT4 weight quantized* model using AutoGPTQ by setting the following environment variables before running the command: -`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true`, -and by adding the argument `--gptq`. - -***Note:*** -Setting the above environment variables improves performance. These variables will be removed in future releases. - - -Here is an example of running the quantized Llama2-7b model `TheBloke/Llama-2-7b-Chat-GPTQ`: -```bash -SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \ -ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \ ---attn_softmax_bf16 \ ---model_name_or_path TheBloke/Llama-2-7b-Chat-GPTQ \ ---use_hpu_graphs \ ---limit_hpu_graphs \ ---use_kv_cache \ ---bucket_size 128 \ ---bucket_internal \ ---trim_logits \ ---max_new_tokens 128 \ ---batch_size 1 \ ---bf16 \ ---gptq -``` - -## Language Model Evaluation Harness - -The evaluation of LLMs can be done using the `run_lm_eval.py` script. It utilizes the [LM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness) framework and lets you run one of four tasks: HellaSwag, Lambada_openai, PiQA, WinoGrande. - -For a more detailed description of the parameters, please see the help message: -``` -python run_lm_eval.py --help -``` - - -### LM Eval Requirements - -First, you should install the requirements: -```bash -pip install -r requirements_lm_eval.txt -``` - - -### Examples - -Evaluate Llama2 7B on Gaudi on the PiQA task, using the BF16 data type: -``` -python run_lm_eval.py \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---bf16 \ ---batch_size=1 \ ---tasks piqa \ --o eval.json -``` - -Evaluate Llama2 70B on 8 Gaudi2 cards on the WinoGrande task, using the BF16 data type: -``` -deepspeed --num_gpus 8 run_lm_eval.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---bf16 \ ---batch_size=1 \ ---tasks winogrande \ --o eval.json -``` - -Both commands write their scores to the JSON file passed via `-o` (a short post-processing sketch is shown below). - - -## Text-Generation Pipeline - -A Transformers-like pipeline is defined and provided [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation/text-generation-pipeline). It is optimized for Gaudi and can be called to generate text in your scripts.
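The `run_lm_eval.py` commands above dump their scores and runtime metadata to the JSON file passed via `-o`. Below is a minimal post-processing sketch for that file; the exact schema depends on the lm-evaluation-harness version, but a top-level `results` key holding per-task metrics is assumed here, while `duration` and `args` are added by `run_lm_eval.py` itself.

```python
# Minimal sketch: print per-task metrics from the file produced with `-o eval.json`.
import json

with open("eval.json") as f:
    report = json.load(f)

# Per-task metrics (e.g. accuracy and its standard error), assuming a top-level "results" key.
for task, metrics in report.get("results", {}).items():
    numeric = {name: value for name, value in metrics.items() if isinstance(value, (int, float))}
    print(f"{task}: {numeric}")

# Runtime metadata stored by run_lm_eval.py next to the harness output.
print("evaluation duration (s):", report.get("duration"))
```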
diff --git a/examples/text-generation/evaluation.py b/examples/text-generation/evaluation.py deleted file mode 100644 index 81959c160d..0000000000 --- a/examples/text-generation/evaluation.py +++ /dev/null @@ -1,123 +0,0 @@ -import argparse -import json - -import evaluate -import nltk -import numpy as np -from transformers import AutoTokenizer - - -###################### Habana internal code ################################## -ACC_TARGET = {"rouge1": 44.4312, "rouge2": 22.0352, "rougeL": 28.6162} - -# See https://github.com/mlcommons/inference/pull/1583 -############################################################################## - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--checkpoint-path", - default="/mnt/weka/data/pytorch/llama2/Llama-2-70b-chat-hf", - help="Path to Llama2-70b-hf-chat checkpoint", - ) - parser.add_argument("--accuracy-file", default="output/accuracy.json", help="path to accuracy.json") - parser.add_argument( - "--dataset-file", - default="/mnt/weka/data/mlperf_inference/llama2/processed-data.pkl", - help="path to processed openorca validation set", - ) - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument( - "--dtype", default="int64", help="dtype of the accuracy log", choices=["int32", "int64", "float"] - ) - args = parser.parse_args() - return args - - -def get_groundtruth(processed_dataset_file): - import pandas as pd - - data = pd.read_pickle(processed_dataset_file) - ground_truths = data["output"] - return ground_truths - - -def postprocess_text(preds, targets): - preds = [pred.strip() for pred in preds] - targets = [target.strip() for target in targets] - - # rougeLSum expects newline after each sentence - preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] - targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets] - - return preds, targets - - -def main(): - args = get_args() - checkpoint_path = args.checkpoint_path - metric = evaluate.load("rouge") - nltk.download("punkt") - - tokenizer = AutoTokenizer.from_pretrained( - checkpoint_path, - model_max_length=2048, - padding_side="left", - use_fast=False, - ) - - targets = get_groundtruth(args.dataset_file) - - target_required = [] - preds_token_ids = [] - - eval_dtype = np.int64 - if args.dtype == "int32": - eval_dtype = np.int32 - elif args.dtype == "float": - eval_dtype = np.float32 - - with open(args.accuracy_file, "r") as f: - results = json.load(f) - - seen = set() - gen_tok_len = 0 - for pred in results: - qsl_idx = pred["qsl_idx"] - if qsl_idx in seen: - continue - - seen.add(qsl_idx) - target = targets[qsl_idx] - target_required.append(target) - pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) - - gen_tok_len += len(pred) - preds_token_ids.append(pred) - - preds_decoded_text = tokenizer.batch_decode(preds_token_ids, skip_special_tokens=True) - - preds, targets = postprocess_text(preds_decoded_text, target_required) - - result = metric.compute(predictions=preds, references=targets, use_stemmer=True, use_aggregator=False) - result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} - prediction_lens = [len(pred) for pred in preds] - gen_num = len(preds) - - acc = [result[key] / ACC_TARGET[key] for key in ACC_TARGET] - acc = round(np.min(acc) * 100, 2) - - result = { - **result, - "gen_len": np.sum(prediction_lens), - "gen_num": gen_num, - "accuracy": acc, # this is Habana internal field - } - - print("\nResults\n") - print(result) - - -if __name__ == "__main__": - 
main() diff --git a/examples/text-generation/quantization_config/act_maxabs_pow2_weights_pcs_opt_pow2_quant.json b/examples/text-generation/quantization_config/act_maxabs_pow2_weights_pcs_opt_pow2_quant.json deleted file mode 100644 index c7c2bd9621..0000000000 --- a/examples/text-generation/quantization_config/act_maxabs_pow2_weights_pcs_opt_pow2_quant.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2", - "dump_stats_path": "./hqt_output/measure" -} diff --git a/examples/text-generation/quantization_config/maxabs_measure.json b/examples/text-generation/quantization_config/maxabs_measure.json deleted file mode 100644 index f586a79c88..0000000000 --- a/examples/text-generation/quantization_config/maxabs_measure.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "dump_stats_path": "./hqt_output/measure", -} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json b/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json deleted file mode 100644 index 230884c3a1..0000000000 --- a/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "method": "HOOKS", - "mode": "MEASURE", - "observer": "maxabs", - "measure_exclude": "NONE", - "dump_stats_path": "./hqt_output/measure" -} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_quant.json b/examples/text-generation/quantization_config/maxabs_quant.json deleted file mode 100644 index ce8bae27a8..0000000000 --- a/examples/text-generation/quantization_config/maxabs_quant.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "dump_stats_path": "./hqt_output/measure" -} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json b/examples/text-generation/quantization_config/maxabs_quant_mixtral.json deleted file mode 100644 index 87dc52d08a..0000000000 --- a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "allowlist": {"types": [], "names": ["gate","w1","w3","w2"]}, - "blocklist": {"types": [], "names": [ - "model.layers.1.block_sparse_moe.experts.(3|4).w2", - "model.layers.[29-31].block_sparse_moe.experts.[0-7].w2" - ]}, - "dump_stats_path": "./hqt_output/measure" -} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_quant_phi.json b/examples/text-generation/quantization_config/maxabs_quant_phi.json deleted file mode 100644 index e7c6b6ddd2..0000000000 --- a/examples/text-generation/quantization_config/maxabs_quant_phi.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "maxabs_hw", - "blocklist": {"types": [], "names": [ - "matmul_qk", - "matmul_av", - "lm_head" - ]}, - "dump_stats_path": "./hqt_output/measure" -} diff --git a/examples/text-generation/quantization_config/unit_scale_quant.json b/examples/text-generation/quantization_config/unit_scale_quant.json deleted file mode 100644 index 216cf27e68..0000000000 --- a/examples/text-generation/quantization_config/unit_scale_quant.json +++ /dev/null 
@@ -1,7 +0,0 @@ -{ - "method": "HOOKS", - "mode": "QUANTIZE", - "observer": "maxabs", - "scale_method": "unit_scale", - "dump_stats_path": "./hqt_output/measure" -} diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py deleted file mode 100644 index 0efc06c8db..0000000000 --- a/examples/text-generation/quantization_tools/unify_measurements.py +++ /dev/null @@ -1,198 +0,0 @@ -import argparse -import json -import os -import sys - -import numpy as np - - -def find_measurement_path(measurement, measurements_dir_path, scales, group_size): - measurment_card = measurement + "_" + str(group_size) - for measurment_file in os.listdir(measurements_dir_path): - filename = os.fsdecode(measurment_file) - if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename: - continue - if scales: - if "MAXABS" in filename: - return os.path.join(measurements_dir_path, measurment_file) - else: - if "MAXABS" not in filename: - return os.path.join(measurements_dir_path, measurment_file) - - -def unify_measurements( - measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index, scales=False -): - measurements_paths = [] - group_name = "" - - # save all the jsons paths in the given measurement group - for measurement in measurement_group: - measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) - measurements_paths.append(measurement_path) - group_name += measurement - - # save all the jsons content in the given measurement group - measurements_jsons = [] - for measurement_path in measurements_paths: - with open(measurement_path, "r") as f: - js = json.load(f) - measurements_jsons.append(js["Nodes"]) - # create a name for the unified json that will be created for this measurement group - - if groups_num == 1: - unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) - .split("/")[-1] - .replace("_" + measurement_group[0] + "_" + str(groups_size), "") - ) - else: - unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) - .split("/")[-1] - .replace( - "_" + measurement_group[0] + "_" + str(groups_size), "_" + str(group_index) + "_" + str(groups_num) - ) - ) - unified_json_path = os.path.join(output_path, unified_json_name) - - # open a unified json file - with open(measurements_paths[0], "r") as origin, open(unified_json_path, "w") as copy: - copy.write(origin.read()) - with open(unified_json_path, "r") as json_file: - unified_json = json.load(json_file) - unified_json["LocalRank"] = group_index if groups_num != 1 else -1 - - # iterate all unified json nodes - for node_name, node_values in unified_json["Nodes"].items(): - max_inputs = node_values["inputs"] - max_outputs = None - if node_values.get("outputs") is not None: - max_outputs = node_values["outputs"] - max_weight = None - if node_values.get("params") is not None and node_values["params"].get("weight") is not None: - max_weight = node_values["params"]["weight"] - - # iterate over all the measurment group and take the maximum for each tensor and its channel - if scales: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) - if max_outputs is not None: - max_outputs = max(measurement_json[node_name]["outputs"], max_outputs) - if max_weight is 
not None: - max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) - else: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - for j in range(0, len(max_inputs[i])): - max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) - if max_outputs is not None: - for i in range(0, len(max_outputs)): - max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) - if max_weight is not None: - for i in range(0, len(max_weight)): - max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) - - # update the maximum in the unified json - if scales: - for i in range(0, len(max_inputs)): - unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] - if max_outputs is not None: - unified_json["Nodes"][node_name]["outputs"] = max_outputs - if max_weight is not None: - unified_json["Nodes"][node_name]["params"]["weight"] = max_weight - else: - for i in range(0, len(max_inputs)): - for j in range(0, len(max_inputs[i])): - unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] - if max_outputs is not None: - for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] - if max_weight is not None: - for i in range(0, len(max_weight)): - unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] - global_rank = None - local_rank = group_index if groups_num != 1 else -1 - mode = "" - layers = {} - with open(unified_json_path, "w") as json_file: - json.dump(unified_json, json_file) - mode = unified_json["Mode"] - nodes = unified_json["Nodes"] - - # create unified npz file from the unified json - unified_npz_path = os.path.join(output_path, unified_json_name.replace(".json", ".npz")) - for layer, dlayer in nodes.items(): - layers[layer] = {} - layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] - if dlayer.get("outputs") is not None: - layers[layer]["outputs"] = np.array(dlayer["outputs"]) - if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: - layers[layer]["params"] = {} - layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) - df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} - with open(unified_npz_path, "w"): - np.savez(unified_npz_path, df) - - -def parse_args(args): - parser = argparse.ArgumentParser( - description="Run the measurements parser", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "-m", "--measurements", type=str, help="path to the directory of the measurements that will be unified" - ) - parser.add_argument( - "-g", - "--groups", - type=list, - nargs="+", - help="groups of cards we want to unify, each group should be seperated by whitespace \ - - e.g. 
01 23 45 67, card 0 measurement will be unified with card 1 measurement and so on", - ) - parser.add_argument( - "-o", - "--out", - type=str, - default=os.getcwd(), - help="path to the directory where the unified measurements will be written", - ) - return parser.parse_args(args) - - -def main(args): - args = parse_args(args) - output_path = args.out - if not os.path.exists(output_path): - os.mkdir(output_path) - measurements_path = args.measurements - groups = args.groups - - num_jsons_drange = 0 - num_jsons_scales = 0 - for path in os.listdir(measurements_path): - if path.endswith(".json"): - if "MAXABS" in path: - num_jsons_scales += 1 - elif "mod_list" not in path: - num_jsons_drange += 1 - assert ( - os.path.isdir(measurements_path) - and (num_jsons_drange % len(groups)) == 0 - and (num_jsons_scales % len(groups)) == 0 - ) - - for group_index, group in enumerate(groups): - unify_measurements( - group, measurements_path, output_path, num_jsons_drange, len(groups), group_index, scales=False - ) - unify_measurements( - group, measurements_path, output_path, num_jsons_scales, len(groups), group_index, scales=True - ) - - print("finished measurement unifier script") - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/examples/text-generation/requirements.txt b/examples/text-generation/requirements.txt deleted file mode 100644 index fcf5474622..0000000000 --- a/examples/text-generation/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -datasets==2.19.2 -peft == 0.11.1 \ No newline at end of file diff --git a/examples/text-generation/requirements_lm_eval.txt b/examples/text-generation/requirements_lm_eval.txt deleted file mode 100644 index 8151dc4c9f..0000000000 --- a/examples/text-generation/requirements_lm_eval.txt +++ /dev/null @@ -1,5 +0,0 @@ -git+https://github.com/EleutherAI/lm-evaluation-harness.git@master -evaluate == 0.4.2 -rouge_score == 0.1.2 -accelerate == 0.31.0 -pandas <= 2.2.2 diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py deleted file mode 100755 index ab5dace4ad..0000000000 --- a/examples/text-generation/run_generation.py +++ /dev/null @@ -1,895 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Conditional text generation on Habana Gaudi/Gaudi2. 
-""" - -import argparse -import json -import logging -import math -import os -import struct -import time -from itertools import cycle -from pathlib import Path - -import pandas as pd -import torch -from utils import adjust_batch, count_hpu_graphs, finalize_quantization, initialize_model - -from optimum.habana.utils import get_hpu_memory_stats - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def setup_parser(parser): - class StoreTrueFalseAction(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - if isinstance(values, bool) or values is None: - # Flag passed without any value -> set to True - setattr(namespace, self.dest, True) - else: - # Flag passed with value -> pattern match and set accordingly - value_str = values.lower() - if value_str in ("true", "1", "yes"): - setattr(namespace, self.dest, True) - elif value_str in ("false", "0", "no"): - setattr(namespace, self.dest, False) - else: - raise ValueError(f"Invalid value for {option_string}: {values}") - - # Arguments management - parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model (on the HF Hub or locally).", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to perform generation in bf16 precision.", - ) - parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") - parser.add_argument( - "--max_input_tokens", - type=int, - default=0, - help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ - if == 0, then truncate to 16 (original default) \ - if < 0, then do not truncate, use full input prompt", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") - parser.add_argument( - "--use_kv_cache", - action="store_true", - help="Whether to use the key/value cache for decoding. It should speed up generation.", - ) - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--dataset_name", - default=None, - type=str, - help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", - ) - parser.add_argument( - "--dataset", - default="/mnt/weka/data/mlperf_inference/llama2/processed-data.pkl", - type=str, - help="path of the dataset to run rouge evaluation and measurement for rouge", - ) - parser.add_argument( - "--column_name", - default=None, - type=str, - help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", - ) - parser.add_argument( - "--do_sample", - action="store_true", - help="Whether to use sampling for generation.", - ) - parser.add_argument( - "--num_beams", - default=1, - type=int, - help="Number of beams used for beam search generation. 
1 means greedy search will be performed.", - ) - parser.add_argument( - "--trim_logits", - action="store_true", - help="Calculate logits only for the last token to save memory in the first step.", - ) - parser.add_argument( - "--seed", - default=27, - type=int, - help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", - ) - parser.add_argument( - "--profiling_warmup_steps", - default=0, - type=int, - help="Number of steps to ignore for profiling.", - ) - parser.add_argument( - "--profiling_steps", - default=0, - type=int, - help="Number of steps to capture for profiling.", - ) - parser.add_argument( - "--profiling_record_shapes", - default=False, - type=bool, - help="Record shapes when enabling profiling.", - ) - parser.add_argument( - "--prompt", - default=None, - type=str, - nargs="*", - help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', - ) - parser.add_argument( - "--bad_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that are not allowed to be generated.", - ) - parser.add_argument( - "--force_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that must be generated.", - ) - parser.add_argument( - "--assistant_model", - default=None, - type=str, - help="Optional argument to give a path to a draft/assistant model for assisted decoding.", - ) - parser.add_argument( - "--peft_model", - default=None, - type=str, - help="Optional argument to give a path to a PEFT model.", - ) - parser.add_argument("--num_return_sequences", type=int, default=1) - parser.add_argument( - "--token", - default=None, - type=str, - help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", - ) - parser.add_argument( - "--model_revision", - default="main", - type=str, - help="The specific model version to use (can be a branch name, tag name or commit id).", - ) - parser.add_argument( - "--attn_softmax_bf16", - action="store_true", - help="Whether to run attention softmax layer in lower precision provided that the model supports it and " - "is also running in lower precision.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Output directory to store results in.", - ) - parser.add_argument( - "--bucket_size", - default=-1, - type=int, - help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ - then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ - we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", - ) - parser.add_argument( - "--bucket_internal", - action="store_true", - help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", - ) - parser.add_argument( - "--dataset_max_samples", - default=-1, - type=int, - help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", - ) - parser.add_argument( - "--limit_hpu_graphs", - action="store_true", - help="Skip HPU Graph usage for first token to save memory", - ) - parser.add_argument( - "--reuse_cache", - action="store_true", - help="Whether to reuse key/value cache for decoding. 
It should save memory.", - ) - parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") - parser.add_argument( - "--simulate_dyn_prompt", - default=None, - type=int, - nargs="*", - help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", - ) - parser.add_argument( - "--reduce_recompile", - action="store_true", - help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", - ) - - parser.add_argument("--gptq", action="store_true", help="Enable Quantization to 4 bit with AutoGPTQ") - - parser.add_argument( - "--use_flash_attention", - nargs="?", - const=True, - default=False, - action=StoreTrueFalseAction, - help="Whether to enable Habana Flash Attention, provided that the model supports it.", - ) - parser.add_argument( - "--flash_attention_recompute", - nargs="?", - const=True, - default=False, - action=StoreTrueFalseAction, - help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", - ) - parser.add_argument( - "--flash_attention_causal_mask", - nargs="?", - const=True, - default=False, - action=StoreTrueFalseAction, - help="Whether to enable Habana Flash Attention in causal mode on first token generation.", - ) - parser.add_argument( - "--flash_attention_fast_softmax", - nargs="?", - const=None, # Default value handled post-parsing - action=StoreTrueFalseAction, - help="Whether to enable Habana Flash Attention in fast softmax mode.", - ) - parser.add_argument( - "--book_source", - action="store_true", - help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", - ) - parser.add_argument( - "--torch_compile", - action="store_true", - help="Whether to use torch compiled model or not.", - ) - parser.add_argument( - "--ignore_eos", - default=True, - action=argparse.BooleanOptionalAction, - help="Whether to ignore eos, set False to disable it", - ) - parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") - parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") - parser.add_argument( - "--const_serialization_path", - "--csp", - type=str, - help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", - ) - parser.add_argument( - "--disk_offload", - action="store_true", - help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", - ) - parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", - ) - parser.add_argument( - "--run_partial_dataset", - action="store_true", - help="Run the inference with dataset for specified --n_iterations(default:5)", - ) - parser.add_argument( - "--load_cp", - action="store_true", - help="Whether to load model from hugging face checkpoint.", - ) - parser.add_argument( - "--parallel_strategy", - type=str, - choices=["tp", "none"], # Add other strategies as needed - default="none", - help="Run multi card with the specified parallel strategy. 
Choices are 'tp' for Tensor Parallel Strategy or 'none'.", - ) - - args = parser.parse_args() - - if args.torch_compile: - args.use_hpu_graphs = False - - if not args.use_hpu_graphs: - args.limit_hpu_graphs = False - - if args.flash_attention_fast_softmax is None: - args.flash_attention_fast_softmax = args.use_flash_attention - - args.quant_config = os.getenv("QUANT_CONFIG", "") - if args.quant_config and args.gptq: - raise RuntimeError("Setting both quant_config and gptq is unsupported. ") - - if args.quant_config == "" and args.disk_offload: - logger.warning( - "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." - ) - return args - - -def main(): - parser = argparse.ArgumentParser() - args = setup_parser(parser) - model, assistant_model, tokenizer, generation_config = initialize_model(args, logger) - - use_lazy_mode = True - if args.torch_compile: - use_lazy_mode = False - - import habana_frameworks.torch.hpu as torch_hpu - - if args.dataset_name == "openorca": - # Benchmark over the prompts below - def get_ds(args): - ds = pd.read_pickle(args.dataset) - return ds - - def get_input(ds, batch_size): - queries = [] - tok_input = ds["tok_input"].tolist() - for start in range(0, len(ds), batch_size): - end = start + batch_size - batch = tok_input[start:end] - input_ids = [] - attention_mask = [] - for query in batch: - input_ids.append([0] * (args.max_input_tokens - len(query)) + query) - attention_mask.append([0] * (args.max_input_tokens - len(query)) + [1] * len(query)) - queries.append( - { - "input_ids": torch.tensor(input_ids, dtype=torch.int32), - "attention_mask": torch.tensor(attention_mask, dtype=torch.int32), - } - ) - return queries - - ds = get_ds(args) - input_sentences = get_input(ds, args.batch_size) - - def generate(input_tokens, size=None, reduce_recompile=False): - """Generates sequences from the input sentences and returns them.""" - - t0 = time.perf_counter() - print(f"Step4+ starting time is {t0*1000}", flush=True) - if size is not None: - input_tokens = adjust_batch(input_tokens, size) - - if not reduce_recompile: - # Move inputs to target device(s) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(args.device) - - outputs = model.generate( - **input_tokens, - generation_config=generation_config, - lazy_mode=use_lazy_mode, - hpu_graphs=args.use_hpu_graphs, - profiling_steps=args.profiling_steps, - profiling_warmup_steps=args.profiling_warmup_steps, - ).cpu() - outputs = outputs.tolist() - for i in range(len(outputs)): - outputs[i] = outputs[i][args.max_input_tokens :] - duration = time.perf_counter() - t0 - print(f"Total E2E time of this batch is {duration:.3f}s", flush=True) - return outputs - - from optimum.habana.utils import HabanaProfile - - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - logger.info("Graph compilation...") - dyn_prompt_lens = args.simulate_dyn_prompt - t0 = time.perf_counter() - # The first three iterations take longer because of graph compilation - if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1: - for _ in range(args.warmup): - if dyn_prompt_lens is None: - print("Warming up", flush=True) - generate(input_sentences[0], None, args.reduce_recompile) - else: - print("Warming up for shape,", dyn_prompt_lens[0], flush=True) - generate(input_sentences[0], dyn_prompt_lens[0], args.reduce_recompile) - else: - if args.bucket_size > 0: - mn = min(dyn_prompt_lens) - mx = 
max(dyn_prompt_lens) - - def rounder(x): - return int(math.ceil(x / args.bucket_size) * args.bucket_size) - - min_prompt_len = rounder(mn) - max_sentence_len = rounder(mx) - for _ in range(args.warmup): - lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) - for sz in lst: - print("Warming up for shape,", sz - 1, flush=True) - generate(input_sentences[0], sz - 1, args.reduce_recompile) - torch_hpu.synchronize() - compilation_duration = time.perf_counter() - t0 - HabanaProfile.enable() - total_new_tokens_generated = 0 - logger.info("Running generate...") - t0 = time.perf_counter() - # Benchmark over n_iterations iterations - N = len(input_sentences) - if dyn_prompt_lens is None: - for i in range(args.n_iterations): - results = [] - b = 1 - for sentence in input_sentences: - generated = generate(sentence, None, args.reduce_recompile) - results.extend(generated) - print(f"Generatig batch {b}/{N}") - b += 1 - else: - repeated_prompt_len = cycle(dyn_prompt_lens) - for i in range(args.n_iterations): - prompt_len = next(repeated_prompt_len) - print("Generating for shape,", prompt_len) - results = [] - for sentence in input_sentences: - generated = generate(sentence, prompt_len, args.reduce_recompile) - results.extend(generated) - duration = time.perf_counter() - t0 - total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens - throughput = total_new_tokens_generated / duration - - # Store results if necessary - if args.output_dir is not None and args.global_rank == 0: - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # TODO dump in hex format - acc_file = [] - num_token = 0 - for i, idx in enumerate(ds.index): - pred = results[i] - eos_token_id = 2 - try: - ind_eos = pred.index(eos_token_id) + 1 - except: # noqa - ind_eos = len(pred) - pred = pred[:ind_eos] - num_token += len(pred) - acc_file.append( - {"seq_id": idx, "qsl_idx": idx, "data": bytes(struct.pack("L" * len(pred), *pred)).hex().upper()} - ) - with open(output_dir / "accuracy.json", "w") as outfile: - outfile.write(json.dumps(acc_file)) - - stats = f"Throughput (including tokenization) = {throughput} tokens/second" - stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}" - separator = "-" * len(stats) - print() - print("Stats:") - print(separator) - print(stats) - mem = get_hpu_memory_stats() - for k, v in mem.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - print(f"Graph compilation duration = {compilation_duration} seconds") - print(separator) - print() - elif args.dataset_name is None: - # Benchmark over the prompts below - if args.prompt: - input_sentences = args.prompt - elif args.book_source: - - def download_book(book_id): - import os - - import requests - - url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt" - response = requests.get(url) - if response.status_code == 200: - pid = os.getpid() - save_path = f"/tmp/{book_id}_{pid}.txt" - with open(save_path, "wb") as file: - file.write(response.content) - print(f"Book downloaded and saved to: {save_path}") - return save_path - else: - print("Failed to download book! 
Exiting...") - import sys - - sys.exit() - - def assemble_prompt(prompt_size, book_path): - prompt = "" - counter = 0 - book_lines = open(book_path).readlines() - for line in book_lines: - for word in line.split(): - counter += 1 - prompt += word + " " - if counter == prompt_size: - return [prompt] * args.batch_size - - book_ids = [ - 2701, # Moby Dick; Or, The Whale - 1513, # Romeo and Juliet - 1342, # Pride and Prejudice - ] - input_sentences = assemble_prompt(prompt_size=args.max_input_tokens, book_path=download_book(book_ids[0])) - else: - input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way", - ] - - if args.batch_size > len(input_sentences): - # Dynamically extends to support larger batch sizes - num_sentences_to_add = args.batch_size - len(input_sentences) - for i in range(num_sentences_to_add): - input_sentences.append(input_sentences[i % len(input_sentences)]) - elif args.batch_size < len(input_sentences): - input_sentences = input_sentences[: args.batch_size] - - def generate(size=None, reduce_recompile=False): - """Generates sequences from the input sentences and returns them.""" - encode_t0 = time.perf_counter() - t0 = time.perf_counter() - print(f"Step4+ starting time is {t0*1000}", flush=True) - # Tokenization - if args.max_input_tokens > 0: - input_tokens = tokenizer.batch_encode_plus( - input_sentences, - return_tensors="pt", - padding="max_length", - max_length=args.max_input_tokens, - truncation=True, - ) - def compute_valid_sequence_lengths_tensor(input_tokens): - attn_mask = input_tokens['attention_mask'] - return torch.sum(attn_mask, dim=1) - valid_sequence_lengths = compute_valid_sequence_lengths_tensor(input_tokens).to(args.device) - generation_config.valid_sequence_lengths = valid_sequence_lengths - else: - input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True) - encode_duration = time.perf_counter() - encode_t0 - - if size is not None: - input_tokens = adjust_batch(input_tokens, size) - if not reduce_recompile: - # Move inputs to target device(s) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(args.device) - iteration_times = [] - output_tokens = model.generate( - **input_tokens, - generation_config=generation_config, - assistant_model=assistant_model, - lazy_mode=use_lazy_mode, - hpu_graphs=args.use_hpu_graphs, - profiling_steps=args.profiling_steps, - profiling_warmup_steps=args.profiling_warmup_steps, - ignore_eos=args.ignore_eos, - iteration_times=iteration_times, - profiling_record_shapes=args.profiling_record_shapes, - ).cpu() - outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True) - duration = time.perf_counter() - t0 - first_token_time = iteration_times[0] + encode_duration - logger.info(f"Time to first token = {first_token_time*1000}ms") - print(f"Total E2E time of this iteration is {duration:.3f}s", flush=True) - return outputs - - from optimum.habana.utils import HabanaProfile - - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - logger.info("Graph compilation...") - dyn_prompt_lens = args.simulate_dyn_prompt - t0 = time.perf_counter() - # The first three iterations take longer because of graph compilation - if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1: - for _ in range(args.warmup): 
- if dyn_prompt_lens is None: - print("Warming up", flush=True) - generate(None, args.reduce_recompile) - else: - print("Warming up for shape,", dyn_prompt_lens[0], flush=True) - generate(dyn_prompt_lens[0], args.reduce_recompile) - else: - if args.bucket_size > 0: - mn = min(dyn_prompt_lens) - mx = max(dyn_prompt_lens) - - def rounder(x): - return int(math.ceil(x / args.bucket_size) * args.bucket_size) - - min_prompt_len = rounder(mn) - max_sentence_len = rounder(mx) - for _ in range(args.warmup): - lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) - for sz in lst: - print("Warming up for shape,", sz - 1, flush=True) - generate(sz - 1, args.reduce_recompile) - torch_hpu.synchronize() - compilation_duration = time.perf_counter() - t0 - HabanaProfile.enable() - total_new_tokens_generated = 0 - logger.info("Running generate...") - t0 = time.perf_counter() - # Benchmark over n_iterations iterations - if dyn_prompt_lens is None: - for i in range(args.n_iterations): - generated = generate(None, args.reduce_recompile) - else: - repeated_prompt_len = cycle(dyn_prompt_lens) - for i in range(args.n_iterations): - prompt_len = next(repeated_prompt_len) - print("Generating for shape,", prompt_len) - generated = generate(prompt_len, args.reduce_recompile) - duration = time.perf_counter() - t0 - total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens - throughput = total_new_tokens_generated / duration - - print() - print("Input/outputs:") - for i, input_sentence in enumerate(zip(input_sentences)): - print(f"input {i+1}: {input_sentence}") - for j, output in enumerate( - zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)]) - ): - print(f"output {j+1}: {output}") - print() - - # Store results if necessary - if args.output_dir is not None and args.global_rank == 0: - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - results = { - "throughput": throughput, - "output": output, - } - with (output_dir / "results.json").open("w", encoding="utf-8") as f: - json.dump(results, f, ensure_ascii=False, indent=4) - - stats = f"Throughput (including tokenization) = {throughput} tokens/second" - stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}" - separator = "-" * len(stats) - print() - print("Stats:") - print(separator) - print(stats) - mem = get_hpu_memory_stats() - for k, v in mem.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - print(f"Graph compilation duration = {compilation_duration} seconds") - print(separator) - print() - else: - # Downloading and loading a dataset from the hub. - from datasets import load_dataset - from torch.utils.data import DataLoader - - assert not args.simulate_dyn_prompt, "Both dataset_name and simulate_dyn_prompt are set" - - raw_dataset = load_dataset(args.dataset_name) - if "test" in raw_dataset: - split = "test" - elif "validation" in raw_dataset: - split = "validation" - else: - split = "train" - raw_dataset = ( - raw_dataset[split] - .shuffle() - .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows)) - ) - - if args.column_name is None: - # If no column name is given, take the first column that has strings - column_name = [key for key in raw_dataset.features.keys() if raw_dataset.features[key].dtype == "string"][ - 0 - ] - logger.info( - f"No column name was given so automatically choosing '{column_name}' for prompts. 
If you would like to use another column of the dataset, you can set the argument `--column_name`." - ) - else: - column_name = args.column_name - - # Remove unused columns - raw_dataset = raw_dataset.remove_columns([name for name in raw_dataset.column_names if name != column_name]) - - # Set the prompt length to args.max_input_tokens if > 0 else (if 0 truncate to 16, otherwise use full length) - prompt_length = args.max_input_tokens if args.max_input_tokens > 0 else (-1, 16)[args.max_input_tokens == 0] - - def preprocess_function(examples): - # Tokenize the texts - return tokenizer( - examples[column_name], - padding="max_length", - max_length=prompt_length if prompt_length > 0 else None, - truncation=prompt_length > 0, - ) - - raw_dataset = raw_dataset.map( - preprocess_function, - batched=True, - desc="Running tokenizer on dataset", - ) - # After tokenization, we can remove the column of interest - raw_dataset = raw_dataset.remove_columns([column_name]) - raw_dataset.set_format(type="torch") - - if prompt_length <= 0: - # Todo please check if this collate function is suitable for your model - # This has been tested for OPT, llama, and Bloom - assert model.config.model_type in ["opt", "bloom", "llama"] - - def collate_fn(data): - collect = {k: [dt[k] for dt in data] for k in data[0]} - result = {} - for k in collect: - tensors = collect[k] - max_shape = max([item.shape[0] for item in tensors]) - result[k] = torch.stack( - [torch.cat((torch.zeros(max_shape - t.shape[0], dtype=t.dtype), t)) for t in tensors], 0 - ) - return result - - else: - collate_fn = None - - dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn) - - def generate_dataset(batch): - prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True) - # Move inputs to target device(s) - for t in batch: - if torch.is_tensor(batch[t]): - batch[t] = batch[t].to(args.device) - # Generate new sequences - outputs = model.generate( - **batch, - generation_config=generation_config, - lazy_mode=use_lazy_mode, - hpu_graphs=args.use_hpu_graphs, - profiling_steps=args.profiling_steps, - profiling_warmup_steps=args.profiling_warmup_steps, - ignore_eos=args.ignore_eos, - profiling_record_shapes=args.profiling_record_shapes, - ).cpu() - return prompt, outputs - - # warmup - from optimum.habana.utils import HabanaProfile - - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - logger.info("Graph compilation...") - t0 = time.perf_counter() - for i, batch in enumerate(dataloader): - print("Warming up", flush=True) - t0 = time.perf_counter() - print(f"Step4+ starting time is {t0*1000}", flush=True) - generate_dataset(batch) - duration = time.perf_counter() - t0 - print(f"Total E2E time of this iteration is {duration:.3f}s", flush=True) - # The first three iterations take longer because of graph compilation - if (i + 1) == 3: - break - torch_hpu.synchronize() - compilation_duration = time.perf_counter() - t0 - HabanaProfile.enable() - - total_new_tokens_generated = 0 - duration = 0 - separator = "-" * 50 - logger.info("Running generate dataset...") - t_start = time.time() - for i, batch in enumerate(dataloader): - t0 = time.perf_counter() - prompt, outputs = generate_dataset(batch) - duration += time.perf_counter() - t0 - total_new_tokens_generated += args.batch_size * args.max_new_tokens - print(separator) - print(f"Batch n°{i+1}") - print(f"Input: {prompt[:args.batch_size]}") - print( - f"Output: {tokenizer.batch_decode(outputs, 
skip_special_tokens=True)[:args.batch_size*args.num_return_sequences]}" - ) - print(separator) - if args.run_partial_dataset and args.n_iterations == i+1: - break - t_end = time.time() - - throughput = total_new_tokens_generated / duration - # Print Stats - - stats = f"Throughput (including tokenization) = {throughput} tokens/second" - separator = "-" * len(stats) - print() - print("Stats:") - print(separator) - print(stats) - print("Total runtime for dataset:", t_end - t_start) - mem = get_hpu_memory_stats() - for k, v in mem.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - if prompt_length > 0: - print(f"Graph compilation duration = {compilation_duration} seconds") - print(separator) - if args.quant_config: - finalize_quantization(model) - if args.const_serialization_path and os.path.isdir(args.const_serialization_path): - import shutil - - shutil.rmtree(args.const_serialization_path) - - -if __name__ == "__main__": - main() diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py deleted file mode 100644 index de4f658858..0000000000 --- a/examples/text-generation/run_lm_eval.py +++ /dev/null @@ -1,215 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -############################################################################### -# Copyright (C) 2020-2021 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import argparse -import json -import logging -import os -import time - -import lm_eval.evaluator -import lm_eval.tasks -import torch -import torch.nn.functional as F -from run_generation import setup_parser -from utils import finalize_quantization, initialize_model - -from optimum.habana.utils import get_hpu_memory_stats - - -os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") -os.environ.setdefault("HF_DATASETS_TRUST_REMOTE_CODE", "true") -logger = logging.getLogger(__name__) - -import multiprocessing as mp -import psutil - -# This hack is a workaround to limitations of lm_eval which always allocates -# mp.Pool with max cpu count which explodes on multinode scenarios and for hpu -# create multiprocess with spawn context -OrigPool = mp.Pool -def LimitedSpawnPool(_): - spawn_context = mp.get_context("spawn") - physical_cpu_count = psutil.cpu_count(logical=False) - pool_size = physical_cpu_count - world_size = int(os.getenv("WORLD_SIZE", 1)) - if world_size == 0: - world_size = 1 - pool_size //= world_size - if (pool_size * world_size) != physical_cpu_count: - pool_size -= 1 - return spawn_context.Pool(pool_size) -mp.Pool = LimitedSpawnPool - - -def setup_lm_eval_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Evaluation script for HPU" - ) - parser.add_argument( - "--buckets", - type=int, - nargs="+", - help="Input length buckets to use with static_shapes", - default=[16, 32, 64, 128, 189, 284], - ) - - parser.add_argument( - "--output_file", "-o", type=str, help="Output file with end results and runtime parameters", required=True - ) - parser.add_argument( - "--tasks", - type=str, - nargs="+", - help="Tasks to run", - default=["hellaswag", "lambada_openai", "piqa", "winogrande"], - ) - parser.add_argument("--limit_iters", type=int, help="limit examples to run that many iterations", default=None) - args = setup_parser(parser) - - return args - - -class HabanaModelAdapter(lm_eval.base.BaseLM): - def __init__(self, tokenizer, model, args, options): - super().__init__() - self.tokenizer = tokenizer - self.model = model - self._batch_size = args.batch_size - self.buckets = sorted(args.buckets) - self.options = options - self._device = args.device - self.model_inputs = {"use_cache": self.options.use_cache} - if self.model.config.model_type in ["llama", "mistral", "falcon", "phi", "mixtral", "qwen2"]: - self.model_inputs.update( - { - "reuse_cache": self.options.reuse_cache, - } - ) - if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon"]: - if self.model.config.model_type != "falcon": - self.model_inputs.update( - { - "attn_softmax_bf16": self.options.attn_softmax_bf16, - } - ) - self.model_inputs.update( - { - "use_flash_attention": self.options.use_flash_attention, - "flash_attention_recompute": self.options.flash_attention_recompute, - "flash_attention_causal_mask": self.options.flash_attention_causal_mask, - } - ) - if args.warmup: - self.warm_up() - - def warm_up(self): - for bucket_size in reversed(self.buckets): - inps = torch.ones((self._batch_size, bucket_size), dtype=torch.int64) - self._model_call(inps) - pass - - @property - def eot_token_id(self): - return self.model.config.eos_token_id - - @property - def max_length(self): - return self.buckets[-1] - - @property - def max_gen_toks(self): - raise NotImplementedError() - - @property - def batch_size(self): - return self._batch_size - - 
@property - def device(self): - # We need to do padding ourselves, otherwise we'll end up with recompilations - # Returning 'cpu' to keep tensors on CPU in lm_eval code - return "cpu" - - def tok_encode(self, string): - return self.tokenizer.encode(string) - - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) - - def _model_generate(self, context, max_length, eos_token_id): - raise NotImplementedError() - - def find_bucket(self, length): - return [b for b in self.buckets if b >= length][0] - - def _model_call(self, inps): - bs, seq_length = inps.shape - padding_length = 0 - if self.options.static_shapes: - bucket_length = self.find_bucket(seq_length) - if self.options.use_cache and self.options.reuse_cache: - self.model.allocate_kv_cache(bs, bucket_length + 1, bucket_length) - padding_length = bucket_length - seq_length - inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id) - logits = self.model(inps.to(self._device), **self.model_inputs)["logits"].cpu() - - if self.options.static_shapes and padding_length > 0: - logits = logits[:, :-padding_length, :] - logits = logits.to(torch.float32) - return logits - - -def main(): - args = setup_lm_eval_parser() - model, _, tokenizer, generation_config = initialize_model(args, logger) - - lm_tasks = lm_eval.tasks.get_task_dict(args.tasks) - with torch.no_grad(): - lm = HabanaModelAdapter(tokenizer, model, args, generation_config) - - eval_start = time.perf_counter() - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit_iters) - if args.device == "hpu": - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - eval_end = time.perf_counter() - - results["args"] = vars(args) - results["duration"] = eval_end - eval_start - - if args.local_rank == 0: - if args.device == "hpu": - mem = get_hpu_memory_stats() - for k, v in mem.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - json.dump(results, open(args.output_file, "w"), indent=2) - print(json.dumps(results, indent=2)) - if args.quant_config: - finalize_quantization(model) - - if args.const_serialization_path and os.path.isdir(args.const_serialization_path): - import shutil - - shutil.rmtree(args.const_serialization_path) - - -if __name__ == "__main__": - main() diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md deleted file mode 100644 index c42622f5d2..0000000000 --- a/examples/text-generation/text-generation-pipeline/README.md +++ /dev/null @@ -1,153 +0,0 @@ - - -# Text-Generation Pipeline - -The text-generation pipeline can be used to perform text-generation by providing single or muliple prompts as input. - -## Requirements - -Update `PYTHONPATH` as follows. 
-```bash -export OPTIMUM_HABANA_PATH=/path/to/optimum-habana -export PYTHONPATH=${PYTHONPATH}:${OPTIMUM_HABANA_PATH}/examples/text-generation -``` - -If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows: -```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 -``` - -If you would like to use the pipeline with LangChain classes, you can install LangChain as follows: -```bash -pip install langchain==0.1.16 -``` - -## Usage - -To run generation with DeepSpeed-inference, you must launch the script as follows: - -```bash -python ../../gaudi_spawn.py --use_deepspeed --world_size number_of_devices run_pipeline.py ARGS -``` - -Without DeepSpeed-inference, you can run the script with: - -```bash -python run_pipeline.py ARGS -``` - -The list of all possible arguments can be obtained running: -```bash -python run_pipeline.py --help -``` - - -### Single and multiple prompts - -If you want to generate a sequence of text from a prompt of your choice, you should use the `--prompt` argument. -For example: -``` -python run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample \ ---prompt "Here is my prompt" -``` - -If you want to provide several prompts as inputs, here is how to do it: -``` -python run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample \ ---batch_size 2 \ ---prompt "Hello world" "How are you?" -``` - -If you want to perform generation on default prompts, do not pass the `--prompt` argument. -``` -python run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample -``` - -If you want to change the temperature and top_p values, make sure to include the `--do_sample` argument. Here is a sample command. -``` -python run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---use_hpu_graphs \ ---use_kv_cache \ ---max_new_tokens 100 \ ---do_sample \ ---temperature 0.5 \ ---top_p 0.95 \ ---batch_size 2 \ ---prompt "Hello world" "How are you?" -``` - -### Multi-card runs - -To run a large model such as Llama-2-70b via DeepSpeed, run the following command. -``` -python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---max_new_tokens 100 \ ---bf16 \ ---use_hpu_graphs \ ---use_kv_cache \ ---batch_size 4 \ ---prompt "Hello world" "How are you?" "Here is my prompt" "Once upon a time" -``` - -To change the temperature and top_p values, run the following command. -``` -python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ ---model_name_or_path meta-llama/Llama-2-70b-hf \ ---max_new_tokens 100 \ ---bf16 \ ---use_hpu_graphs \ ---use_kv_cache \ ---do_sample \ ---temperature 0.5 \ ---top_p 0.95 \ ---batch_size 4 \ ---prompt "Hello world" "How are you?" "Here is my prompt" "Once upon a time" -``` - -### Usage with LangChain - -To run a Q&A example with LangChain, use the script `run_pipeline_langchain.py`. It supports a similar syntax to `run_pipeline.py`. 
For example, you can use following command: -``` -python run_pipeline_langchain.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --bf16 \ - --use_hpu_graphs \ - --use_kv_cache \ - --batch_size 32 \ - --max_new_tokens 1024 \ - --do_sample \ - --device=hpu -``` - -> The pipeline class has been validated for LangChain version 0.1.16 and may not work with other versions of the package. diff --git a/examples/text-generation/text-generation-pipeline/pipeline.py b/examples/text-generation/text-generation-pipeline/pipeline.py deleted file mode 100644 index 1a5a999249..0000000000 --- a/examples/text-generation/text-generation-pipeline/pipeline.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -from transformers import TextGenerationPipeline -from utils import initialize_model - - -class GaudiTextGenerationPipeline(TextGenerationPipeline): - def __init__(self, args, logger, use_with_langchain=False, warmup_on_init=True): - self.model, self.tokenizer, self.generation_config = initialize_model(args, logger) - - self.task = "text-generation" - self.device = args.device - - if args.do_sample: - self.generation_config.temperature = args.temperature - self.generation_config.top_p = args.top_p - - self.max_padding_length = args.max_input_tokens if args.max_input_tokens > 0 else 100 - self.use_hpu_graphs = args.use_hpu_graphs - self.profiling_steps = args.profiling_steps - self.profiling_warmup_steps = args.profiling_warmup_steps - self.profiling_record_shapes = args.profiling_record_shapes - - self.use_with_langchain = use_with_langchain - if self.use_with_langchain: - self.generation_config.ignore_eos = False - - if warmup_on_init: - import habana_frameworks.torch.hpu as torch_hpu - - logger.info("Graph compilation...") - - warmup_promt = ["Here is my prompt"] * args.batch_size - for _ in range(args.warmup): - _ = self(warmup_promt) - torch_hpu.synchronize() - - def __call__(self, prompt): - use_batch = isinstance(prompt, list) - - if use_batch: - model_inputs = self.tokenizer.batch_encode_plus( - prompt, return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True - ) - else: - model_inputs = self.tokenizer.encode_plus( - prompt, return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True - ) - - for t in model_inputs: - if torch.is_tensor(model_inputs[t]): - model_inputs[t] = model_inputs[t].to(self.device) - - output = self.model.generate( - **model_inputs, - generation_config=self.generation_config, - lazy_mode=True, - hpu_graphs=self.use_hpu_graphs, - profiling_steps=self.profiling_steps, - profiling_warmup_steps=self.profiling_warmup_steps, - profiling_record_shapes=self.profiling_record_shapes, - ).cpu() - - if use_batch: - output_text = self.tokenizer.batch_decode(output, skip_special_tokens=True) - else: - output_text = self.tokenizer.decode(output[0], skip_special_tokens=True) - - if self.use_with_langchain: - if use_batch: - return [{"generated_text": unbatched_output_text} for unbatched_output_text in output_text] - else: - return [{"generated_text": output_text}] - - return output_text diff --git a/examples/text-generation/text-generation-pipeline/run_pipeline.py b/examples/text-generation/text-generation-pipeline/run_pipeline.py deleted file mode 100644 index 43aea65cec..0000000000 --- a/examples/text-generation/text-generation-pipeline/run_pipeline.py +++ /dev/null @@ -1,63 +0,0 @@ -import argparse -import logging -import math -import time - -from pipeline import GaudiTextGenerationPipeline -from run_generation 
import setup_parser - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - args = setup_parser(parser) - args.num_return_sequences = 1 - - if args.prompt: - input_sentences = args.prompt - else: - input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way", - ] - - if args.batch_size > len(input_sentences): - times_to_extend = math.ceil(args.batch_size / len(input_sentences)) - input_sentences = input_sentences * times_to_extend - - input_sentences = input_sentences[: args.batch_size] - - logger.info("Initializing text-generation pipeline...") - pipe = GaudiTextGenerationPipeline(args, logger) - - duration = 0 - for iteration in range(args.n_iterations): - logger.info(f"Running inference iteration {iteration+1}...") - t0 = time.perf_counter() - output = pipe(input_sentences) - duration += time.perf_counter() - t0 - - for i, (input_sentence, generated_text) in enumerate(zip(input_sentences, output)): - print(f"Prompt[{iteration+1}][{i+1}]: {input_sentence}") - print(f"Generated Text[{iteration+1}][{i+1}]: {repr(generated_text)}\n") - - throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration - print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") - print(f"Throughput: {throughput} tokens/second") - - -if __name__ == "__main__": - main() diff --git a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py b/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py deleted file mode 100644 index 5d19640c44..0000000000 --- a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 Intel Corporation and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import logging -import math -import time - -from langchain.chains import LLMChain -from langchain.llms import HuggingFacePipeline -from langchain.prompts import PromptTemplate -from pipeline import GaudiTextGenerationPipeline -from run_generation import setup_parser - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - args = setup_parser(parser) - - # Initialize the pipeline - pipe = GaudiTextGenerationPipeline(args, logger, use_with_langchain=True, warmup_on_init=False) - - # Create LangChain object - llm = HuggingFacePipeline(pipeline=pipe) - - template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\ - just say that you don't know, don't try to make up an answer. - - Context: Large Language Models (LLMs) are the latest models used in NLP. - Their superior performance over smaller models has made them incredibly - useful for developers building NLP enabled applications. These models - can be accessed via Hugging Face's `transformers` library, via OpenAI - using the `openai` library, and via Cohere using the `cohere` library. - - Question: {question} - Answer: """ - - prompt = PromptTemplate(input_variables=["question"], template=template) - llm_chain = LLMChain(prompt=prompt, llm=llm) - - questions = [ - {"question": "Which libraries and model providers offer LLMs?"}, - {"question": "What is the provided context about?"}, - {"question": "Can I use LLMs on CPU?"}, - {"question": "How easy is to build my own LLM?"}, - {"question": "Can I use LLM to order pizza?"}, - {"question": "Can I install LLM into my phone?"}, - ] - - if args.batch_size > len(questions): - times_to_extend = math.ceil(args.batch_size / len(questions)) - questions = questions * times_to_extend - - input_questions = questions[: args.batch_size] - - import habana_frameworks.torch.hpu as torch_hpu - - logger.info("LangChain warmup (graph compilation)...") - for _ in range(args.warmup): - # Use LangChain object - _ = llm_chain.generate(input_questions) - torch_hpu.synchronize() - - duration = 0 - for iteration in range(args.n_iterations): - t0 = time.perf_counter() - # Use LangChain object - responses = llm_chain.generate(input_questions) - duration += time.perf_counter() - t0 - - for i, (question, answer) in enumerate(zip(input_questions, responses.generations)): - print(f"Question[{iteration+1}][{i+1}]: {question['question']}") - print(f"Response[{iteration+1}][{i+1}]: {repr(answer[0].text)}\n") - - throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration - print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") - print(f"Throughput: {throughput} tokens/second") - - -if __name__ == "__main__": - main() diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py deleted file mode 100644 index fc0cad190e..0000000000 --- a/examples/text-generation/utils.py +++ /dev/null @@ -1,644 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -############################################################################### -# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company -############################################################################### - -import copy -import glob -import os -import shutil -import tempfile -import time -from pathlib import Path - -import torch -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from transformers.utils import check_min_version - -from optimum.habana.checkpoint_utils import ( - get_ds_injection_policy, - get_repo_root, - model_is_optimized, - model_on_meta, - write_checkpoints_json, -) -from optimum.habana.utils import ( - check_habana_frameworks_version, - check_optimum_habana_min_version, - get_habana_frameworks_version, - set_seed, -) - - -def adjust_batch(batch, size): - curr_size = batch["input_ids"].shape[1] - if curr_size >= size: - adjusted_batch = { - "input_ids": batch["input_ids"][:, :size], - "attention_mask": batch["attention_mask"][:, :size], - } - else: - adjusted_batch = {} - for k in batch.keys(): - last_colm = batch[k][:, -1] - expanded = last_colm.tile((size - curr_size, 1)).T - adjusted_batch[k] = torch.concat([batch[k], expanded], 1) - assert adjusted_batch["input_ids"].shape[1] == size - assert adjusted_batch["attention_mask"].shape[1] == size - return adjusted_batch - - -def override_print(enable): - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if force or enable: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def override_logger(logger, enable): - logger_info = logger.info - - def info(*args, **kwargs): - force = kwargs.pop("force", False) - if force or enable: - logger_info(*args, **kwargs) - - logger.info = info - - -def count_hpu_graphs(): - return len(glob.glob(".graph_dumps/*PreGraph*")) - - -def override_prints(enable, logger): - override_print(enable) - override_logger(logger, enable) - - -def setup_distributed(args): - args.local_rank = int(os.getenv("LOCAL_RANK", "0")) - args.world_size = int(os.getenv("WORLD_SIZE", "0")) - args.global_rank = int(os.getenv("RANK", "0")) - - -def setup_inference(args, model): - import habana_frameworks.torch.core as htcore - - habana_version = get_habana_frameworks_version() - - print("Initializing inference mode") - # Keeping the if-else here for back compat. 
TODO remove later - if habana_version.major >= 1 and habana_version.minor >= 16: - htcore.hpu_initialize(model, mark_only_scales_as_const=True) - else: - const_marking = os.getenv("ENABLE_CONST_MARKING", "True") - if const_marking == "True": - htcore.hpu_initialize(model) - return model - - -def setup_const_serialization(const_serialization_path): - import uuid - - const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) - os.makedirs(const_serialization_path) - from habana_frameworks.torch.hpu import enable_const_section_serialization - - print("Serializing const params to {}".format(const_serialization_path)) - enable_const_section_serialization(const_serialization_path, True) - - -def setup_env(args): - # Will error if the minimal version of Transformers is not installed. Remove at your own risks. - check_min_version("4.34.0") - check_optimum_habana_min_version("1.9.0.dev0") - # TODO: SW-167588 - WA for memory issue in hqt prep_model - os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") - - if args.global_rank == 0 and not args.torch_compile: - os.environ.setdefault("GRAPH_VISUALIZATION", "true") - shutil.rmtree(".graph_dumps", ignore_errors=True) - - if args.world_size > 0: - os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") - os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") - - if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: - # Based upon above conditions and below env variable, - # we can call HPU graphs clear_inputs(). - os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") - - # Tweak generation so that it runs faster on Gaudi - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - adapt_transformers_to_gaudi() - - -def setup_device(args): - if args.device == "hpu": - import habana_frameworks.torch.core as htcore - - if args.quant_config: - htcore.hpu_set_env() - return torch.device(args.device) - - -# patching LinearAllreduce to use ScopedLinearAllReduce -def patch_scoped_linear_all_reduce(model): - from deepspeed.module_inject.layers import LinearAllreduce - - from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce - - for name, module in model.named_children(): - if type(module) is LinearAllreduce: - SL = ScopedLinearAllReduce(mod=module) - setattr(model, name, SL) - patch_scoped_linear_all_reduce(module) - - -def get_torch_compiled_model(model): - if model.config.model_type in ['gpt_bigcode']: - # For gpt_bigcode model_type, model.transformer is used instead of model.model - model.transformer = torch.compile(model.transformer, backend="hpu_backend", options={"keep_input_mutations": True}) - else: - model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) - return model - - -def setup_quantization(model, args): - if os.getenv("USE_INC", "1") != "0": - from neural_compressor.torch.quantization import FP8Config, convert, prepare - - config = FP8Config.from_json_file(args.quant_config) - if config.measure: - model = prepare(model, config) - elif config.quantize: - model = convert(model, config) - else: - import habana_quantization_toolkit - - habana_quantization_toolkit.prep_model(model) - - return model - - -def finalize_quantization(model): - if os.getenv("USE_INC", "1") != "0": - from neural_compressor.torch.quantization import finalize_calibration - - finalize_calibration(model) - else: - import habana_quantization_toolkit - - 
habana_quantization_toolkit.finish_measurements(model) - - -def setup_model(args, model_dtype, model_kwargs, logger): - logger.info("Single-device run.") - if args.assistant_model is None: - assistant_model = None - else: - logger.info(f"Using asssitant model {args.assistant_model}.") - if args.disk_offload: - from accelerate import infer_auto_device_map, init_empty_weights - - config = AutoConfig.from_pretrained(args.model_name_or_path) - with init_empty_weights(): - model = AutoModelForCausalLM.from_config(config) - max_memory = {"cpu": "10GiB"} - device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - device_map=device_map, - offload_folder="/tmp/offload_folder/", - offload_state_dict=True, - torch_dtype=model_dtype, - **model_kwargs, - ) - elif args.gptq: - from transformers import GPTQConfig - - quantization_config = GPTQConfig(bits=4, use_exllama=False) - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs - ) - elif args.load_cp: - from neural_compressor.torch.quantization import load - model = load( - model_name_or_path=args.model_name_or_path, - format="huggingface", - device="hpu", - **model_kwargs - ) - else: - if args.assistant_model is not None: - assistant_model = AutoModelForCausalLM.from_pretrained( - args.assistant_model, torch_dtype=model_dtype, **model_kwargs - ) - if args.peft_model is not None: - model = peft_model(args, model_dtype, logger, **model_kwargs) - else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs - ) - if args.quant_config: - model = setup_quantization(model, args) - - if args.assistant_model is not None: - import habana_quantization_toolkit - - habana_quantization_toolkit.quantize_model(assistant_model) - - model = model.eval().to(args.device) - if args.assistant_model is not None: - assistant_model = assistant_model.eval().to(args.device) - - if args.use_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - from optimum.habana.transformers.trainer import _is_peft_model - - if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": - model = wrap_in_hpu_graph(model, hash_with_views=False) - else: - model = wrap_in_hpu_graph(model) - if args.assistant_model is not None: - assistant_model = wrap_in_hpu_graph(assistant_model) - if _is_peft_model(model): - model.base_model = wrap_in_hpu_graph(model.base_model) - - if args.torch_compile: - model = get_torch_compiled_model(model) - # if args.assistant_model is not None: - # assistant_model = get_torch_compiled_model(assistant_model) - return model, assistant_model - - -def setup_distributed_model_tp(args, model_dtype, model_kwargs, logger, cache_dir): - from typing import Any, MutableMapping - - from optimum.habana.distributed import serialization - from optimum.habana.distributed.strategy import TensorParallelStrategy - - logger.info("Multi-device run.") - - assert args.quant_config == "", "Fp8 is not enabled, unset QUANT_CONFIG" - assert args.assistant_model is None, "Assistant model must be None" - - from torch import distributed as dist - - if args.device == "hpu": - dist.init_process_group(backend="hccl") - else: - assert False, "Supports TP only on HPU" - - torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD) - logger.info("Creating Model") - config = 
AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model_kwargs = {} - model_kwargs["parallel_strategy"] = TensorParallelStrategy() - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype, **model_kwargs) - - initial_device = torch.device("cpu") - source = "hf" - checkpoint_sharding = None - lazy_sd: MutableMapping[str, Any] = {} - logger.info("Loading Checkpoints") - lazy_sd = serialization.load_state_dict( - cache_dir, - source=source, - distributed_strategy=args.parallel_strategy, - checkpoint_sharding=None, - initial_device=initial_device, - rank=args.global_rank, - world_size=args.world_size, - ) - architecture = "llama" - if len(lazy_sd): - serialization.load_state_dict_into_model( - model, - lazy_sd, - architecture, - source, - args.parallel_strategy, - checkpoint_sharding, - initial_device, - args.local_rank, - args.world_size, - ) - - model = model.eval().to(args.device) - - if args.use_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model = wrap_in_hpu_graph(model) - - if args.torch_compile and model.config.model_type == "llama": - model = get_torch_compiled_model(model) - - return model, args.assistant_model - - -def setup_distributed_model(args, model_dtype, model_kwargs, logger): - import deepspeed - - logger.info("DeepSpeed is enabled.") - deepspeed.init_distributed(dist_backend="hccl") - config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - load_to_meta = model_on_meta(config) - - if args.assistant_model is None: - assistant_model = None - else: - logger.info(f"Using asssitant model {args.assistant_model}.") - - if load_to_meta: - # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load - with deepspeed.OnDevice(dtype=model_dtype, device="meta"): - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - - # Model loaded to meta is managed differently - checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") - - # For PEFT models, write the merged model on disk to be able to load it on the meta device - if args.peft_model is not None: - merged_model_dir = "/tmp/text_generation_merged_peft_model" - if args.local_rank == 0: - if Path(merged_model_dir).is_dir(): - shutil.rmtree(merged_model_dir) - peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) - torch.distributed.barrier() - - write_checkpoints_json( - merged_model_dir if args.peft_model is not None else args.model_name_or_path, - args.local_rank, - checkpoints_json, - token=args.token, - ) - else: - # TODO: revisit placement on CPU when auto-injection is possible - with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): - if args.peft_model is not None: - model = peft_model(args, model_dtype, logger, **model_kwargs) - else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs - ) - model.eval() - - if args.assistant_model is not None: - assistant_model = AutoModelForCausalLM.from_pretrained( - args.assistant_model, torch_dtype=model_dtype, **model_kwargs - ).eval() - - # Initialize the model - ds_inference_kwargs = {"dtype": model_dtype} - ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} - ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs - ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) - if load_to_meta: - ds_inference_kwargs["checkpoint"] 
= checkpoints_json.name - - model = deepspeed.init_inference(model, **ds_inference_kwargs) - model = model.module - if model.config.model_type in ["llama", "falcon", "qwen2"]: - patch_scoped_linear_all_reduce(model) - - if args.quant_config: - model = setup_quantization(model, args) - if args.assistant_model is not None: - import habana_quantization_toolkit - - habana_quantization_toolkit.prep_model(assistant_model) - - if args.torch_compile: - model = get_torch_compiled_model(model) - # if args.assistant_model is not None: - # assistant_model = get_torch_compiled_model(assistant_model) - return model, assistant_model - - -def peft_model(args, model_dtype, logger, **model_kwargs): - import importlib.util - - if importlib.util.find_spec("peft") is None: - raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") - from peft import AutoPeftModelForCausalLM - from peft.config import PeftConfigMixin - - base_model_name = PeftConfigMixin.from_pretrained( - args.peft_model, - token=model_kwargs["token"] if "token" in model_kwargs else None, - ).base_model_name_or_path - - base_model_is_local = Path(base_model_name).is_dir() - if not base_model_is_local: - # Check if the base model path to a remote repository on the HF Hub exists - from huggingface_hub import list_repo_files - - try: - list_repo_files(base_model_name) - base_model_is_remote = True - except Exception: - base_model_is_remote = False - - if base_model_is_local or base_model_is_remote: - model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) - else: - # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model - logger.warning( - f"The base model `{base_model_name}` of the LoRA configuration associated" - f" to `{args.peft_model}` does not exist locally or remotely. Using " - f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." 
- ) - from peft import PeftModel - - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) - if hasattr(model, "merge_and_unload"): - model = model.merge_and_unload() - if model_dtype == torch.bfloat16: - model = model.to(torch.bfloat16) - return model - else: - from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation - - model.__class__.generate = gaudi_generate - model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation - return model - - -def setup_tokenizer(args, model, assistant_model): - tokenizer_kwargs = { - "revision": args.model_revision, - "token": args.token, - "trust_remote_code": args.trust_remote_code, - } - if args.bad_words is not None or args.force_words is not None: - tokenizer_kwargs["add_prefix_space"] = True - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) - if not model.config.is_encoder_decoder: - tokenizer.padding_side = "left" - - if model.config.model_type == "llama": - # unwind broken decapoda-research config - model.generation_config.pad_token_id = 0 - model.generation_config.bos_token_id = 1 - model.generation_config.eos_token_id = 2 - if assistant_model is not None: - assistant_model.generation_config.pad_token_id = 0 - assistant_model.generation_config.bos_token_id = 1 - assistant_model.generation_config.eos_token_id = 2 - tokenizer.bos_token_id = model.generation_config.bos_token_id - tokenizer.eos_token_id = model.generation_config.eos_token_id - tokenizer.pad_token_id = model.generation_config.pad_token_id - tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) - tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) - tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) - if model.config.model_type == "persimmon": - model.generation_config.pad_token_id = model.generation_config.eos_token_id - if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id - tokenizer.bos_token_id = model.generation_config.bos_token_id - tokenizer.eos_token_id = model.generation_config.eos_token_id - tokenizer.pad_token_id = model.generation_config.pad_token_id - tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) - tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) - tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) - - # Some models like GPT2 do not have a PAD token so we have to set it if necessary - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - model.generation_config.pad_token_id = model.generation_config.eos_token_id - if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id - - return tokenizer, model, assistant_model - - -def setup_generation_config(args, model, assistant_model, tokenizer): - bad_words_ids = None - force_words_ids = None - if args.bad_words is not None: - bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] - if args.force_words is not None: - force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] - - is_optimized = model_is_optimized(model.config) - - # Generation configuration - generation_config = copy.deepcopy(model.generation_config) - 
generation_config.max_new_tokens = args.max_new_tokens - generation_config.use_cache = args.use_kv_cache - generation_config.static_shapes = is_optimized and assistant_model is None - generation_config.bucket_size = args.bucket_size if is_optimized else -1 - generation_config.bucket_internal = args.bucket_internal - generation_config.do_sample = args.do_sample - generation_config.num_beams = args.num_beams - generation_config.bad_words_ids = bad_words_ids - generation_config.force_words_ids = force_words_ids - generation_config.num_return_sequences = args.num_return_sequences - generation_config.trim_logits = args.trim_logits - generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 - generation_config.limit_hpu_graphs = args.limit_hpu_graphs - generation_config.reuse_cache = args.reuse_cache - generation_config.reduce_recompile = args.reduce_recompile - if generation_config.reduce_recompile: - assert generation_config.bucket_size > 0 - generation_config.use_flash_attention = args.use_flash_attention - generation_config.flash_attention_recompute = args.flash_attention_recompute - generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask - generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax - generation_config.trust_remote_code = args.trust_remote_code - setattr(generation_config, 'valid_sequence_lengths', None) - - return generation_config - - -def exclude_hpu_graph_configs(args): - # Excluded configs for batch size 1 for hpu graph - if args.batch_size == 1 and args.limit_hpu_graphs: - if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: - return False - if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: - if args.quant_config: - if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: - return False - else: - if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: - return False - return True - else: - return False - - -def initialize_model(args, logger): - init_start = time.perf_counter() - setup_distributed(args) - if exclude_hpu_graph_configs(args): - args.limit_hpu_graphs = False - override_prints(args.global_rank == 0 or args.verbose_workers, logger) - setup_env(args) - setup_device(args) - set_seed(args.seed) - cache_dir = get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) - if args.assistant_model is not None: - get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) - use_deepspeed = args.world_size > 0 - if use_deepspeed or args.bf16: - model_dtype = torch.bfloat16 - else: - model_dtype = torch.float - args.attn_softmax_bf16 = False - - model_kwargs = { - "revision": args.model_revision, - "token": args.token, - "trust_remote_code": args.trust_remote_code, - } - if args.load_cp: - model_kwargs["torch_dtype"] = torch.bfloat16 - - if args.trust_remote_code: - logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") - - model, assistant_model = ( - setup_model(args, model_dtype, model_kwargs, logger) - if not use_deepspeed - else setup_distributed_model(args, model_dtype, model_kwargs, logger) - if not args.parallel_strategy == "tp" - else setup_distributed_model_tp(args, model_dtype, model_kwargs, logger, cache_dir) - ) - tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) - generation_config = setup_generation_config(args, model, assistant_model, tokenizer) - - if args.const_serialization_path: - 
setup_const_serialization(args.const_serialization_path) - if args.quant_config: - model = setup_inference(args, model) - init_end = time.perf_counter() - logger.info(f"Args: {args}") - logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") - logger.info(f"Model initialization took {(init_end - init_start):.3f}s") - return model, assistant_model, tokenizer, generation_config diff --git a/examples/text-to-speech/README.md b/examples/text-to-speech/README.md deleted file mode 100644 index a1e089f55e..0000000000 --- a/examples/text-to-speech/README.md +++ /dev/null @@ -1,40 +0,0 @@ - - -# Text to Speech Examples - -This directory contains a script that showcases how to use the Transformers pipeline API to run text to speech task on HPUs. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-HPU inference - -```bash -python3 run_pipeline.py \ - --model_name_or_path microsoft/speecht5_tts \ - --text "Hello, my dog is cooler than you!" \ - --use_hpu_graphs \ - --bf16 -``` -Models that have been validated: - - [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts) - - [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) - - [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt deleted file mode 100644 index 96ced8c4bb..0000000000 --- a/examples/text-to-speech/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -datasets == 2.19.2 -soundfile == 0.12.1 diff --git a/examples/text-to-speech/run_pipeline.py b/examples/text-to-speech/run_pipeline.py deleted file mode 100644 index 1d9b53de7d..0000000000 --- a/examples/text-to-speech/run_pipeline.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import logging -import time - -import soundfile as sf -import torch -from datasets import load_dataset -from transformers import pipeline - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi -from optimum.habana.utils import set_seed - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - help="Path to pre-trained model", - ) - parser.add_argument( - "--text", - default=None, - type=str, - nargs="*", - help='Text input. 
Can be a single string (eg: --text "text1"), or a list of space-separated strings (eg: --text "text1" "text2")', - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to perform generation in bf16 precision.", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - parser.add_argument("--seed", type=int, default=555, help="make speech generation deterministic") - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", - ) - args = parser.parse_args() - - adapt_transformers_to_gaudi() - text = args.text - text_bs = len(text) - set_seed(args.seed) - - if args.batch_size > text_bs: - # Dynamically extends to support larger batch sizes - text_to_add = args.batch_size - text_bs - for i in range(text_to_add): - text.append(text[i % text_bs]) - elif args.batch_size < text_bs: - text = text[: args.batch_size] - - if args.bf16: - model_dtype = torch.bfloat16 - else: - model_dtype = torch.float32 - - generator = pipeline( - "text-to-speech", - model=args.model_name_or_path, - torch_dtype=model_dtype, - device="hpu", - ) - - if args.use_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - generator.model = wrap_in_hpu_graph(generator.model) - - forward_params = None - if generator.model.config.model_type == "speecht5": - embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") - speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to("hpu") - forward_params = {"speaker_embeddings": speaker_embedding} - if generator.model.config.model_type == "seamless_m4t": - forward_params = {"tgt_lang": "eng"} - - generate_kwargs = None - if generator.model.can_generate(): - generate_kwargs = {"lazy_mode": True, "ignore_eos": False, "hpu_graphs": args.use_hpu_graphs} - - with torch.autocast("hpu", torch.bfloat16, enabled=args.bf16), torch.inference_mode(): - # warm up - for i in range(args.warmup): - if generator.model.config.model_type == "speecht5": - # SpeechT5 forces a dropout with training=True, which may zero out some elements randomly. - # A random dropout may need different lengths of spectrograms to fit probability thresholds, - # which violates the HPU static shape, so we have to fix the seed here. - set_seed(args.seed) - generator(text, batch_size=args.batch_size, forward_params=forward_params, generate_kwargs=generate_kwargs) - - start = time.time() - for i in range(args.n_iterations): - if generator.model.config.model_type == "speecht5": - # SpeechT5 forces a dropout with training=True, which may zero out some elements randomly. - # A random dropout may need different lengths of spectrograms to fit probability thresholds, - # which violates the HPU static shape, so we have to fix the seed here. 
- set_seed(args.seed) - speech = generator( - text, batch_size=args.batch_size, forward_params=forward_params, generate_kwargs=generate_kwargs - ) - end = time.time() - logger.info(f"speech = {speech} time = {(end-start) * 1000 / args.n_iterations }ms") - sf.write("speech.wav", speech[0]["audio"].squeeze(), samplerate=speech[0]["sampling_rate"]) - - -if __name__ == "__main__": - main() diff --git a/examples/translation/README.md b/examples/translation/README.md deleted file mode 100644 index 1d705d23fc..0000000000 --- a/examples/translation/README.md +++ /dev/null @@ -1,243 +0,0 @@ - - -# Translation Examples - -`run_translation.py` is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it. - -For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files. -You will also find examples of these below. - -## Requirements - -First, you should install the requirements: -```bash -pip install -r requirements.txt -``` - -## Single-card Training - -Here is an example of a translation fine-tuning with a T5 model. -T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For instance: - -```bash -python run_translation.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --source_lang en \ - --target_lang ro \ - --source_prefix "translate English to Romanian: " \ - --dataset_name wmt16 \ - --dataset_config_name ro-en \ - --output_dir /tmp/tst-translation \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -If you get a terrible BLEU score, make sure that you didn't forget to use the `--source_prefix` argument. - -For the aforementioned group of T5 models, it's important to remember that if you switch to a different language pair, make sure to adjust the source and target values in all 3 language-specific command line arguments: `--source_lang`, `--target_lang` and `--source_prefix`. - -In lazy mode, make sure to use the arguments `--pad_to_max_length` and `--ignore_pad_token_for_loss False` to pad batches to max length and to avoid negative pad tokens. 
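As a minimal illustration of why the lazy-mode commands above pass `--pad_to_max_length` (this sketch is not part of the example scripts; it only assumes the `t5-small` tokenizer and the `sentencepiece` dependency listed in `requirements.txt` are available), dynamic padding produces batch shapes that change with the longest sample, while max-length padding keeps shapes constant and therefore avoids recompilations on HPU:

```python
# Hedged sketch: contrasts dynamic padding (shapes vary per batch) with the
# static max-length padding recommended for lazy mode on HPU.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
sentences = [
    "translate English to Romanian: Hello world",
    "translate English to Romanian: How are you doing today?",
]

# Dynamic padding: the sequence dimension follows the longest sample in the batch,
# so tensor shapes differ from batch to batch.
dynamic = tokenizer(sentences, padding=True, return_tensors="pt")

# Static padding: every batch is padded (or truncated) to the same fixed length,
# so tensor shapes are identical across training steps.
static = tokenizer(
    sentences, padding="max_length", max_length=128, truncation=True, return_tensors="pt"
)

print(dynamic["input_ids"].shape)  # varies with the batch content
print(static["input_ids"].shape)   # always torch.Size([2, 128])
```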
- -And here is how you would use the translation finetuning on your own files, after adjusting the -values for the arguments `--train_file`, `--validation_file` to match your setup: - -```bash -python run_translation.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --source_lang en \ - --target_lang ro \ - --source_prefix "translate English to Romanian: " \ - --dataset_name wmt16 \ - --dataset_config_name ro-en \ - --train_file path_to_jsonlines_file \ - --validation_file path_to_jsonlines_file \ - --output_dir /tmp/tst-translation \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --throughput_warmup_steps 3 \ - --bf16 -``` - -The task of translation supports only custom JSONLINES files, with each line being a dictionary with the key `"translation"` and its value another dictionary whose keys is the language pair. For example: - -```json -{ "translation": { "en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă." } } -{ "translation": { "en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia." } } -``` -Here the languages are Romanian (`ro`) and English (`en`). - -If you want to use a pre-processed dataset that leads to high BLEU scores, but for the `en-de` language pair, you can use `--dataset_name stas/wmt14-en-de-pre-processed`, as follows: - -```bash -python run_translation.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --source_lang en \ - --target_lang de \ - --source_prefix "translate English to German: " \ - --dataset_name stas/wmt14-en-de-pre-processed \ - --output_dir /tmp/tst-translation \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --throughput_warmup_steps 3 \ - --bf16 - ``` - - - ## Multi-card Training - - Here is an example of distributing training on 8 HPUs: - - ```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_translation.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --source_lang en \ - --target_lang ro \ - --source_prefix '"translate English to Romanian: "' \ - --dataset_name wmt16 \ - --dataset_config_name ro-en \ - --output_dir /tmp/tst-translation \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - -## Using DeepSpeed - - Here is an example with DeepSpeed on 8 HPUs: - - ```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_translation.py \ - --model_name_or_path t5-small \ - --do_train \ - --do_eval \ - --source_lang en \ - --target_lang ro \ - --source_prefix '"translate English to Romanian: "' \ - --dataset_name wmt16 \ - --dataset_config_name ro-en \ - --output_dir /tmp/tst-translation \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - 
--predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - - -## Inference - -To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with BERT on GLUE on 1 Gaudi card with the following command: -```bash -python run_translation.py \ - --model_name_or_path t5-small \ - --do_eval \ - --source_lang en \ - --target_lang ro \ - --source_prefix "translate English to Romanian: " \ - --dataset_name wmt16 \ - --dataset_config_name ro-en \ - --output_dir /tmp/tst-translation \ - --per_device_eval_batch_size 4 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --bf16 -``` diff --git a/examples/translation/requirements.txt b/examples/translation/requirements.txt deleted file mode 100644 index fb13645e78..0000000000 --- a/examples/translation/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 2.4.0, <= 2.19.2 -sentencepiece != 0.1.92 -protobuf == 3.20.3 -sacrebleu >= 1.4.12, <= 2.4.2 -py7zr == 0.21.0 -torch >= 1.3, <= 2.4.0 -evaluate == 0.4.2 diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py deleted file mode 100644 index db40ef8f28..0000000000 --- a/examples/translation/run_translation.py +++ /dev/null @@ -1,727 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for sequence to sequence. -""" -# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. 
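The custom jsonlines format described in the translation README above can be sanity-checked before launching training. The short sketch below is illustrative only (the `train.json` file name is hypothetical, not something the script creates): it writes the two sample records from the README and loads them back with the 🤗 Datasets JSON loader to confirm the `{"translation": {...}}` structure.

```python
# Hedged sketch: writes two translation records in the jsonlines format described
# in the README and loads them back to confirm the structure.
# The "train.json" file name is purely illustrative.
import json

from datasets import load_dataset

records = [
    {"translation": {"en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă."}},
    {"translation": {"en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia."}},
]

with open("train.json", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

raw_datasets = load_dataset("json", data_files={"train": "train.json"})
print(raw_datasets["train"][0]["translation"]["en"])  # -> "Others have dismissed him as a joke."
```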
- -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -import evaluate -import numpy as np -import transformers -from datasets import load_dataset -from transformers import ( - AutoConfig, - AutoModelForSeq2SeqLM, - AutoTokenizer, - DataCollatorForSeq2Seq, - HfArgumentParser, - M2M100Tokenizer, - MBart50Tokenizer, - MBart50TokenizerFast, - MBartTokenizer, - MBartTokenizerFast, - NllbTokenizerFast, - default_data_collator, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version - -from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -from optimum.habana.utils import set_seed - - -try: - from optimum.habana.utils import check_optimum_habana_min_version -except ImportError: - - def check_optimum_habana_min_version(*a, **b): - return () - - -logger = logging.getLogger(__name__) - -# Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.40.0") -check_optimum_habana_min_version("1.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") - -# A list of all multilingual tokenizer which require src_lang and tgt_lang attributes. -MULTILINGUAL_TOKENIZERS = [ - MBartTokenizer, - MBartTokenizerFast, - MBart50Tokenizer, - MBart50TokenizerFast, - M2M100Tokenizer, - NllbTokenizerFast, -] - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - trust_remote_code: bool = field( - default=False, - metadata={ - "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." - ) - }, - ) - use_cache: bool = field( - default=True, - metadata={ - "help": ( - "Whether or not the model should return the last key/values attentions (not used by all models)." - "Only relevant if `config.is_decoder=True`." 
- ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - source_lang: str = field(default=None, metadata={"help": "Source language id for translation."}) - target_lang: str = field(default=None, metadata={"help": "Target language id for translation."}) - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a jsonlines)."}) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file." - }, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_source_length: Optional[int] = field( - default=1024, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - max_target_length: Optional[int] = field( - default=128, - metadata={ - "help": ( - "The maximum total sequence length for target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - val_max_target_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total sequence length for validation target text after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`. " - "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " - "during ``evaluate`` and ``predict``." - ) - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for HPU in lazy mode." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - ) - }, - ) - num_beams: Optional[int] = field( - default=1, - metadata={ - "help": ( - "Number of beams to use for evaluation. 
This argument will be passed to ``model.generate``, " - "which is used during ``evaluate`` and ``predict``." - ) - }, - ) - ignore_pad_token_for_loss: bool = field( - default=True, - metadata={ - "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." - }, - ) - source_prefix: Optional[str] = field( - default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} - ) - forced_bos_token: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for" - " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to" - " be the target language token.(Usually it is the target language token)" - ) - }, - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - elif self.source_lang is None or self.target_lang is None: - raise ValueError("Need to specify the source language and the target language.") - - # accepting both json and jsonl file extensions, as - # many jsonlines files actually have a .json extension - valid_extensions = ["json", "jsonl"] - - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in valid_extensions, "`train_file` should be a jsonlines file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in valid_extensions, "`validation_file` should be a jsonlines file." - if self.val_max_target_length is None: - self.val_max_target_length = self.max_target_length - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_translation", model_args, data_args) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
- transformers.utils.logging.set_verbosity_info() - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - gaudi_config = GaudiConfig.from_pretrained( - training_args.gaudi_config_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - ) - - # Log on each process the small summary: - mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, " - + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " - + f"mixed-precision training: {mixed_precision}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - if data_args.source_prefix is None and model_args.model_name_or_path in [ - "google-t5/t5-small", - "google-t5/t5-base", - "google-t5/t5-large", - "google-t5/t5-3b", - "google-t5/t5-11b", - ]: - logger.warning( - "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with " - "`--source_prefix 'translate English to German: ' `" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own JSON training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For translation, only JSON files are supported, with one field named "translation" containing two keys for the - # source and target languages (unless you adapt what follows). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
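        # For example, `--dataset_name wmt16 --dataset_config_name ro-en` yields a DatasetDict with
        # "train", "validation" and "test" splits, where each example carries a single "translation"
        # column holding a dict such as {"en": "...", "ro": "..."}.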
- raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - if extension == "jsonl": - builder_name = "json" # the "json" builder reads both .json and .jsonl files - else: - builder_name = extension # e.g. "parquet" - raw_datasets = load_dataset( - builder_name, - data_files=data_files, - cache_dir=model_args.cache_dir, - token=model_args.token, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - token=model_args.token, - trust_remote_code=model_args.trust_remote_code, - ) - - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. - embedding_size = model.get_input_embeddings().weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - - # Set decoder_start_token_id - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): - if isinstance(tokenizer, MBartTokenizer): - model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang] - else: - model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang) - - if model.config.decoder_start_token_id is None: - raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") - - prefix = data_args.source_prefix if data_args.source_prefix is not None else "" - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. 
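    # Illustrative sketch (values made up): with `--source_prefix "translate English to Romanian: "`,
    # a raw record {"translation": {"en": "Hello", "ro": "Salut"}} is turned into
    #     encoder input : "translate English to Romanian: Hello"
    #     decoder labels: the token ids of "Salut"
    # `preprocess_function` below applies exactly this mapping to whole batches.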
- if training_args.do_train: - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - column_names = raw_datasets["validation"].column_names - elif training_args.do_predict: - column_names = raw_datasets["test"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") - return - - # For translation we set the codes of our source and target languages (only useful for mBART, the others will - # ignore those attributes). - if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): - assert data_args.target_lang is not None and data_args.source_lang is not None, ( - f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " - "--target_lang arguments." - ) - - tokenizer.src_lang = data_args.source_lang - tokenizer.tgt_lang = data_args.target_lang - - # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token - # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. - forced_bos_token_id = ( - tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None - ) - model.config.forced_bos_token_id = forced_bos_token_id - - # Get the language codes for input/target. - source_lang = data_args.source_lang.split("_")[0] - target_lang = data_args.target_lang.split("_")[0] - - # Check whether the source target length fits in the model, if it has absolute positional embeddings - if ( - hasattr(model.config, "max_position_embeddings") - and not hasattr(model.config, "relative_attention_max_distance") - and model.config.max_position_embeddings < data_args.max_source_length - ): - raise ValueError( - f"`--max_source_length` is set to {data_args.max_source_length}, but the model only has" - f" {model.config.max_position_embeddings} position encodings. Consider either reducing" - f" `--max_source_length` to {model.config.max_position_embeddings} or using a model with larger position " - "embeddings" - ) - - # Temporarily set max_target_length for training. - max_target_length = data_args.max_target_length - padding = "max_length" if data_args.pad_to_max_length else False - - if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): - logger.warning( - "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for " - f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" - ) - - def preprocess_function(examples): - inputs = [ex[source_lang] for ex in examples["translation"]] - targets = [ex[target_lang] for ex in examples["translation"]] - inputs = [prefix + inp for inp in inputs] - model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) - - # Tokenize targets with the `text_target` keyword argument - labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. 
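        # For instance, with a T5 tokenizer (whose pad_token_id is 0), an illustrative padded label
        # sequence [8774, 55, 1, 0, 0] becomes [8774, 55, 1, -100, -100]; positions equal to -100
        # are skipped by the cross-entropy loss.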
- if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - with training_args.main_process_first(desc="train dataset map pre-processing"): - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - - if training_args.do_eval: - max_target_length = data_args.val_max_target_length - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - with training_args.main_process_first(desc="validation dataset map pre-processing"): - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - - if training_args.do_predict: - max_target_length = data_args.val_max_target_length - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_dataset = raw_datasets["test"] - if data_args.max_predict_samples is not None: - max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) - with training_args.main_process_first(desc="prediction dataset map pre-processing"): - predict_dataset = predict_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) - - # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id - if data_args.pad_to_max_length: - data_collator = default_data_collator - else: - data_collator = DataCollatorForSeq2Seq( - tokenizer, - model=model, - label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if training_args.fp16 else None, - ) - - # Metric - metric = evaluate.load("sacrebleu", cache_dir=model_args.cache_dir) - - def postprocess_text(preds, labels): - preds = [pred.strip() for pred in preds] - labels = [[label.strip()] for label in labels] - - return preds, labels - - def compute_metrics(eval_preds): - preds, labels = eval_preds - if isinstance(preds, tuple): - preds = preds[0] - # Replace -100s used for padding as we can't decode them - preds = np.where(preds != -100, preds, tokenizer.pad_token_id) - decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 
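        # `postprocess_text` wraps each reference in its own list because sacrebleu scores every
        # prediction against one or more references, i.e. it expects
        # predictions=["hypothesis"] and references=[["reference 1", "reference 2", ...]].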
- - # Some simple post-processing - decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - - result = metric.compute(predictions=decoded_preds, references=decoded_labels) - result = {"bleu": result["score"]} - - prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] - result["gen_len"] = np.mean(prediction_lens) - result = {k: round(v, 4) for k, v in result.items()} - return result - - # Initialize our Trainer - trainer = GaudiSeq2SeqTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - results = {} - max_length = ( - training_args.generation_max_length - if training_args.generation_max_length is not None - else data_args.val_max_target_length - ) - num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.do_predict: - logger.info("*** Predict ***") - - predict_results = trainer.predict( - predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams - ) - metrics = predict_results.metrics - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - if trainer.is_world_process_zero(): - if training_args.predict_with_generate: - predictions = predict_results.predictions - predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id) - predictions = tokenizer.batch_decode( - predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - predictions = [pred.strip() for pred in predictions] - output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") - with open(output_prediction_file, "w", encoding="utf-8") as writer: - writer.write("\n".join(predictions)) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = 
data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None] - if len(languages) > 0: - kwargs["language"] = languages - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - return results - - -if __name__ == "__main__": - main() diff --git a/examples/trl/README.md b/examples/trl/README.md deleted file mode 100644 index e6a4f0b006..0000000000 --- a/examples/trl/README.md +++ /dev/null @@ -1,268 +0,0 @@ -# Examples - - -## Requirements - -First, you should install the requirements: -``` -$ pip install -U -r requirements.txt -``` - -## DPO pipeline - -### Training - -The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model. -There are two main steps to the DPO training process: -1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: - - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --output_dir="./sft" \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_llama2" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=meta-llama/Llama-2-7b-hf \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --report_to=none - ``` -For large model like Llama2-70B, we could use DeepSpeed Zero-3 to enable DPO training in multi-card. -steps like: -1. 
Supervised fine-tuning of the base llama-v2-70b model to create llama-v2-70b-se: - - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py \ - --model_name_or_path meta-llama/Llama-2-70b-hf \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --output_dir="./sft" \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --per_device_train_batch_size=1 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_llama2" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-70b-hf" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=meta-llama/Llama-2-70b-hf \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --max_prompt_length=256 \ - --max_length=512 \ - --report_to=none - ``` - - -### Merging the adaptors - -To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: - -``` -python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo" --output_name="stack-llama-2" -``` - -which will also push the model to your HuggingFace hub account. - -### Running the model - -We can load the DPO-trained LoRA adaptors which were saved by the DPO training step and run it through the [text-generation example](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation). - -``` -python run_generation.py \ ---model_name_or_path ../trl/stack-llama-2/ \ ---use_hpu_graphs --use_kv_cache --batch_size 1 --bf16 --max_new_tokens 100 \ ---prompt "Here is my prompt" - -``` - - -## PPO pipeline - -### Training - -The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model. -There are three main steps to the PPO training process: -1. 
Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --output_dir="./sft" \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_llama2" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` -2. Reward modeling using dialog pairs from the SE dataset on the llama-v2-7b-se to create llama-v2-7b-se-rm - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi reward_modeling.py \ - --model_name=./sft/final_merged_checkpoint \ - --tokenizer_name=meta-llama/Llama-2-7b-hf \ - --output_dir=./rm - ``` - To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: - - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="rm" --output_name="rm_merged_checkpoint" - ``` - -3. RL fine-tuning of llama-v2-7b-se with the llama-v2-7b-se-rm reward model: - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi ppo.py \ - --model_name=./sft/final_merged_checkpoint \ - --reward_model_name=./rm_merged_checkpoint \ - --tokenizer_name=meta-llama/Llama-2-7b-hf \ - --adafactor=False \ - --output_max_length=128 \ - --batch_size=8 \ - --gradient_accumulation_steps=8 \ - --batched_gen=True \ - --ppo_epochs=4 \ - --seed=0 \ - --learning_rate=1.4e-5 \ - --early_stopping=True \ - --output_dir=llama-se-rl-finetune - ``` - To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL: - - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="llama-se-rl-finetune" --output_name="rl_merged_checkpoint" - ``` - -### Running the model -We can load the PPO-trained LoRA adaptors which were saved by the PPO training step and run it through the [text-generation example](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation). - -``` -python run_generation.py \ ---model_name_or_path ../trl/rl_merged_checkpoint/ \ ---use_hpu_graphs --use_kv_cache --batch_size 1 --bf16 --max_new_tokens 100 \ ---prompt "Here is my prompt" -``` - -## DDPO pipeline - -### Training -The following example is for fine-tuning stable diffusion using Denoising Diffusion Policy Optimization -([DDPO](https://huggingface.co/docs/trl/en/ddpo_trainer)). The implementation supports LoRA and -non-LoRA-based training. LoRA based training is faster and less finicky to converge than non-LoRA -based training. Recommendations for non-Lora based training (described [here](https://huggingface.co/blog/trl-ddpo)) -are setting the learning rate relatively low (e.g., 1e-5) and disabling mixed precision training. -HPU graphs are enabled by default for better performance. - -There are two main steps to the DDPO training process: - -1. 
Fine-tuning of the base stable-diffusion model with LoRA to create ddpo-aesthetic-predictor: -``` -python ddpo.py \ - --num_epochs=200 \ - --train_gradient_accumulation_steps=1 \ - --sample_num_steps=50 \ - --sample_batch_size=6 \ - --train_batch_size=3 \ - --sample_num_batches_per_epoch=4 \ - --per_prompt_stat_tracking=True \ - --per_prompt_stat_tracking_buffer_size=32 \ - --train_learning_rate=1e-05 \ - --tracker_project_name="stable_diffusion_training" \ - --log_with="tensorboard" \ - --use_habana \ - --use_hpu_graphs \ - --bf16 \ - --hf_hub_model_id="ddpo-finetuned-stable-diffusion" \ - --push_to_hub False -``` - -2. Inference using the fine-tuned LoRA weights as shown in the example below: -```python -import torch - -from optimum.habana import GaudiConfig -from optimum.habana.trl import GaudiDefaultDDPOStableDiffusionPipeline - -gaudi_config = GaudiConfig.from_pretrained("Habana/stable-diffusion") -model_id = "runwayml/stable-diffusion-v1-5" -lora_model_id = "ddpo-finetuned-stable-diffusion" -pipeline = GaudiDefaultDDPOStableDiffusionPipeline( - model_id, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=gaudi_config, -) -pipeline.sd_pipeline.load_lora_weights(lora_model_id) -device = torch.device("hpu") - -# memory optimization -pipeline.vae.to(device, torch.bfloat16) -pipeline.text_encoder.to(device, torch.bfloat16) -pipeline.unet.to(device, torch.bfloat16) - -prompts = ["lion", "squirrel", "crab", "starfish", "whale", "sponge", "plankton"] -results = pipeline(prompts) - -for prompt, image in zip(prompts, results.images): - image.save(f"{prompt}.png") -``` diff --git a/examples/trl/ddpo.py b/examples/trl/ddpo.py deleted file mode 100644 index c493c7165d..0000000000 --- a/examples/trl/ddpo.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright 2023 metric-space, The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Adapted from: https://github.com/huggingface/trl/blob/v0.7.8/examples/scripts/ddpo.py -# The only differences are: -# - add new args gaudi_config -# - use GaudiDefaultDDPOStableDiffusionPipeline instead of DefaultDDPOStableDiffusionPipeline -# - use GaudiDDPOTrainer instead of DDPOTrainer -# - use hpu in aesthetic_scorer -# - cast model to bf16. 
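# High-level flow of this example: `prompt_fn` samples a random animal name, the Stable Diffusion
# pipeline generates images for that prompt, and `aesthetic_scorer` embeds each image with CLIP and
# maps the embedding through a small MLP to a scalar aesthetic score, which DDPO then maximizes.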
- -""" -python ddpo.py \ - --num_epochs=200 \ - --train_gradient_accumulation_steps=1 \ - --sample_num_steps=50 \ - --sample_batch_size=6 \ - --train_batch_size=3 \ - --sample_num_batches_per_epoch=4 \ - --train_learning_rate=1e-05 \ - --per_prompt_stat_tracking=True \ - --per_prompt_stat_tracking_buffer_size=32 \ - --tracker_project_name="stable_diffusion_training" \ - --log_with="tensorboard" \ - --bf16 -""" - -import os -from dataclasses import dataclass, field - -import numpy as np -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from huggingface_hub.utils import EntryNotFoundError -from transformers import CLIPModel, CLIPProcessor, HfArgumentParser -from trl import DDPOConfig - -from optimum.habana import GaudiConfig -from optimum.habana.trl import GaudiDDPOTrainer, GaudiDefaultDDPOStableDiffusionPipeline - - -@dataclass -class ScriptArguments: - hf_user_access_token: str = field( - default=None, metadata={"help": "Hugging Face token. If None, token is retrieved from env or cache."} - ) - pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} - ) - pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) - hf_hub_model_id: str = field( - default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"} - ) - hf_hub_aesthetic_model_id: str = field( - default="trl-lib/ddpo-aesthetic-predictor", - metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"}, - ) - hf_hub_aesthetic_model_filename: str = field( - default="aesthetic-model.pth", - metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"}, - ) - use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."}) - bf16: bool = field(default=False, metadata={"help": "Whether to use bf16 mixed precision."}) - gaudi_config_name: str = field( - default="Habana/stable-diffusion", metadata={"help": "Name or path of the Gaudi configuration"} - ) - push_to_hub: bool = field(default=False, metadata={"help": "Whether or not to push the model to the Hub."}) - use_habana: bool = field(default=True, metadata={"help": "Whether or not to use HPU."}) - use_hpu_graphs: bool = field(default=True, metadata={"help": "Whether or not to use hpu graphs."}) - - -class MLP(nn.Module): - def __init__(self): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(768, 1024), - nn.Dropout(0.2), - nn.Linear(1024, 128), - nn.Dropout(0.2), - nn.Linear(128, 64), - nn.Dropout(0.1), - nn.Linear(64, 16), - nn.Linear(16, 1), - ) - - @torch.no_grad() - def forward(self, embed): - return self.layers(embed) - - -class AestheticScorer(torch.nn.Module): - """ - This model attempts to predict the aesthetic score of an image. The aesthetic score - is a numerical approximation of how much a specific image is liked by humans on average. 
- This is from https://github.com/christophschuhmann/improved-aesthetic-predictor - """ - - def __init__(self, *, dtype, model_id, model_filename): - super().__init__() - self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") - self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") - self.mlp = MLP() - try: - cached_path = hf_hub_download(model_id, model_filename) - except EntryNotFoundError: - cached_path = os.path.join(model_id, model_filename) - state_dict = torch.load(cached_path, map_location=torch.device("cpu")) - self.mlp.load_state_dict(state_dict) - self.dtype = dtype - self.eval() - - @torch.no_grad() - def __call__(self, images): - device = next(self.parameters()).device - inputs = self.processor(images=images, return_tensors="pt") - inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()} - embed = self.clip.get_image_features(**inputs) - # normalize embedding - embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) - return self.mlp(embed).squeeze(1) - - -def aesthetic_scorer(hub_model_id, model_filename, use_habana=True): - scorer = AestheticScorer( - model_id=hub_model_id, - model_filename=model_filename, - dtype=torch.float32, - ) - if use_habana: - scorer = scorer.to("hpu") - else: - scorer = scorer.to("cpu") - - def _fn(images, prompts, metadata): - images = (images * 255).round().clamp(0, 255).to(torch.uint8) - scores = scorer(images) - return scores, {} - - return _fn - - -# list of example prompts to feed stable diffusion -animals = [ - "cat", - "dog", - "horse", - "monkey", - "rabbit", - "zebra", - "spider", - "bird", - "sheep", - "deer", - "cow", - "goat", - "lion", - "frog", - "chicken", - "duck", - "goose", - "bee", - "pig", - "turkey", - "fly", - "llama", - "camel", - "bat", - "gorilla", - "hedgehog", - "kangaroo", -] - - -def prompt_fn(): - return np.random.choice(animals), {} - - -def image_outputs_logger(image_data, global_step, accelerate_logger): - # For the sake of this example, we will only log the last batch of images - # and associated data - result = {} - images, prompts, _, rewards, _ = image_data[-1] - - for i, image in enumerate(images): - prompt = prompts[i] - reward = rewards[i].item() - result[f"{prompt:.25} | {reward:.2f}"] = image.unsqueeze(0).float() - - accelerate_logger.log_images( - result, - step=global_step, - ) - - -if __name__ == "__main__": - parser = HfArgumentParser((ScriptArguments, DDPOConfig)) - args, ddpo_config = parser.parse_args_into_dataclasses() - ddpo_config.mixed_precision = "bf16" if args.bf16 else "no" - ddpo_config.project_kwargs = { - "logging_dir": "./logs", - "automatic_checkpoint_naming": True, - "total_limit": 5, - "project_dir": "./save", - } - - # 1. 
initialize Gaudi config: - gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) if args.use_habana else None - - pipeline = GaudiDefaultDDPOStableDiffusionPipeline( - args.pretrained_model, - pretrained_model_revision=args.pretrained_revision, - use_lora=args.use_lora, - use_habana=args.use_habana, - use_hpu_graphs=args.use_hpu_graphs, - gaudi_config=gaudi_config, - ) - - trainer = GaudiDDPOTrainer( - ddpo_config, - aesthetic_scorer( - args.hf_hub_aesthetic_model_id, - args.hf_hub_aesthetic_model_filename, - args.use_habana, - ), - prompt_fn, - pipeline, - image_samples_hook=image_outputs_logger, - gaudi_config=gaudi_config, - use_habana=args.use_habana, - use_hpu_graphs=args.use_hpu_graphs, - ) - - trainer.train() - - if args.push_to_hub: - trainer.push_to_hub(args.hf_hub_model_id, token=args.hf_user_access_token) - else: - trainer.save_pretrained(args.hf_hub_model_id) diff --git a/examples/trl/dpo.py b/examples/trl/dpo.py deleted file mode 100644 index 5779296bbb..0000000000 --- a/examples/trl/dpo.py +++ /dev/null @@ -1,260 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py, enable it for Gaudi2 -from dataclasses import dataclass, field -from typing import Dict, List, Optional - -import torch -from datasets import Dataset, load_dataset -from peft import LoraConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser -from transformers.integrations.deepspeed import ( - is_deepspeed_available, -) - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.trl import GaudiDPOTrainer -from optimum.habana.utils import set_seed - - -# Define and parse arguments. -@dataclass -class ScriptArguments: - """ - The arguments for the DPO training script. 
- """ - - # data parameters - beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"}) - - # training parameters - model_name_or_path: Optional[str] = field( - default="../sft/results/final_checkpoint", - metadata={"help": "the location of the SFT model name or path"}, - ) - tokenizer_name_or_path: Optional[str] = field( - default="meta-llama/Llama-2-7b-hf", - metadata={"help": "the location of the SFT model name or path"}, - ) - learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"}) - lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"}) - warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"}) - weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"}) - optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"}) - - per_device_train_batch_size: Optional[int] = field(default=1, metadata={"help": "train batch size per device"}) - per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"}) - gradient_accumulation_steps: Optional[int] = field( - default=4, metadata={"help": "the number of gradient accumulation steps"} - ) - gradient_checkpointing: Optional[bool] = field( - default=False, metadata={"help": "whether to use gradient checkpointing"} - ) - - lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"}) - lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) - lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) - lora_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the LoRA method."}, - ) - max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"}) - max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"}) - max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"}) - logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"}) - save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"}) - eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"}) - - output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"}) - log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"}) - - # instrumentation - sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"}) - report_to: Optional[str] = field( - default="wandb", - metadata={ - "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,' - '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. ' - 'Use `"all"` to report to all integrations installed, `"none"` for no integrations.' - }, - ) - # debug argument for distributed training - ignore_bias_buffers: Optional[bool] = field( - default=False, - metadata={ - "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. 
See" - "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992" - }, - ) - seed: Optional[int] = field( - default=0, metadata={"help": "Random seed that will be set at the beginning of training."} - ) - deepspeed: Optional[str] = field(default=None, metadata={"help": "the deepspeed json config file"}) - num_workers: Optional[int] = field(default=None, metadata={"help": "the number of workers to map the data"}) - - -def get_stack_exchange_paired( - data_dir: str = "data/rl", - sanity_check: bool = False, - cache_dir: str = None, - num_proc=24, -) -> Dataset: - """Load the stack-exchange-paired dataset from Hugging Face and convert it to the necessary format. - - The dataset is converted to a dictionary with the following structure: - { - 'prompt': List[str], - 'chosen': List[str], - 'rejected': List[str], - } - - Prompts are structured as follows: - "Question: " + + "\n\nAnswer: " - """ - dataset = load_dataset( - "lvwerra/stack-exchange-paired", - split="train", - cache_dir=cache_dir, - data_dir=data_dir, - ) - original_columns = dataset.column_names - - if sanity_check: - dataset = dataset.select(range(min(len(dataset), 1000))) - - def return_prompt_and_responses(samples) -> Dict[str, str]: - return { - "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]], - "chosen": samples["response_j"], - "rejected": samples["response_k"], - } - - return dataset.map( - return_prompt_and_responses, - batched=True, - num_proc=num_proc, - remove_columns=original_columns, - ) - - -if __name__ == "__main__": - parser = HfArgumentParser(ScriptArguments) - script_args = parser.parse_args_into_dataclasses()[0] - - # 1. initialize training arguments: - training_args = GaudiTrainingArguments( - per_device_train_batch_size=script_args.per_device_train_batch_size, - per_device_eval_batch_size=script_args.per_device_eval_batch_size, - max_steps=script_args.max_steps, - logging_steps=script_args.logging_steps, - save_steps=script_args.save_steps, - gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, - learning_rate=script_args.learning_rate, - evaluation_strategy="steps", - eval_steps=script_args.eval_steps, - output_dir=script_args.output_dir, - report_to=script_args.report_to, - lr_scheduler_type=script_args.lr_scheduler_type, - warmup_steps=script_args.warmup_steps, - optim=script_args.optimizer_type, - bf16=True, - remove_unused_columns=False, - run_name="dpo_llama2", - use_habana=True, - use_lazy_mode=True, - use_hpu_graphs_for_training=not script_args.gradient_checkpointing and (not script_args.deepspeed), - use_hpu_graphs_for_inference=not script_args.deepspeed, - seed=script_args.seed, - deepspeed=script_args.deepspeed, - overwrite_output_dir=True, - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - low_cpu_mem_usage = True - if is_deepspeed_available(): - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled - - if is_deepspeed_zero3_enabled(): - low_cpu_mem_usage = False - - # 2. 
load a pretrained model - model = AutoModelForCausalLM.from_pretrained( - script_args.model_name_or_path, - low_cpu_mem_usage=low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - ) - model.config.use_cache = False - model.config.use_fused_rope = False - - if script_args.ignore_bias_buffers: - # torch distributed hack - model._ddp_params_and_buffers_to_ignore = [ - name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool - ] - - model_ref = AutoModelForCausalLM.from_pretrained( - script_args.model_name_or_path, - low_cpu_mem_usage=low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - ) - model_ref.config.use_cache = False - tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name_or_path) - tokenizer.pad_token = tokenizer.eos_token - - # 3. Load the Stack-exchange paired dataset - train_dataset = get_stack_exchange_paired( - data_dir="data/rl", sanity_check=script_args.sanity_check, num_proc=script_args.num_workers - ) - train_dataset = train_dataset.filter( - lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length - ) - - # 4. Load evaluation dataset - eval_dataset = get_stack_exchange_paired( - data_dir="data/evaluation", sanity_check=True, num_proc=script_args.num_workers - ) - eval_dataset = eval_dataset.filter( - lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length - ) - - peft_config = LoraConfig( - r=script_args.lora_r, - lora_alpha=script_args.lora_alpha, - lora_dropout=script_args.lora_dropout, - target_modules=script_args.lora_target_modules, - bias="none", - task_type="CAUSAL_LM", - ) - - gaudi_config = GaudiConfig() - gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - # 5. initialize the DPO trainer - dpo_trainer = GaudiDPOTrainer( - model, - model_ref, - gaudi_config=gaudi_config, - args=training_args, - beta=script_args.beta, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - peft_config=peft_config, - max_prompt_length=script_args.max_prompt_length, - max_length=script_args.max_length, - ) - - # 6. train - train_result = dpo_trainer.train() - - # 7. save - dpo_trainer.save_model(script_args.output_dir) - - # 8. save metric - metrics = train_result.metrics - dpo_trainer.log_metrics("train", metrics) - dpo_trainer.save_metrics("train", metrics) diff --git a/examples/trl/merge_peft_adapter.py b/examples/trl/merge_peft_adapter.py deleted file mode 100644 index 8913fc62a4..0000000000 --- a/examples/trl/merge_peft_adapter.py +++ /dev/null @@ -1,50 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py. -# only difference is removal of model.push_to_hub -from dataclasses import dataclass, field -from typing import Optional - -import torch -from peft import PeftConfig, PeftModel -from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser - - -@dataclass -class ScriptArguments: - """ - The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the - merged model. 
- """ - - adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"}) - base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"}) - output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"}) - - -parser = HfArgumentParser(ScriptArguments) -script_args = parser.parse_args_into_dataclasses()[0] -assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge" -assert script_args.base_model_name is not None, "please provide the name of the Base model" -assert script_args.output_name is not None, "please provide the output name of the merged model" - -peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name) -if peft_config.task_type == "SEQ_CLS": - # The sequence classification task is used for the reward model in PPO - model = AutoModelForSequenceClassification.from_pretrained( - script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16 - ) -else: - model = AutoModelForCausalLM.from_pretrained( - script_args.base_model_name, return_dict=True, torch_dtype=torch.bfloat16 - ) - -tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name) - -# Load the PEFT model -model = PeftModel.from_pretrained(model, script_args.adapter_model_name) -model.eval() - -model = model.merge_and_unload() - -model.save_pretrained(f"{script_args.output_name}") -tokenizer.save_pretrained(f"{script_args.output_name}") -# model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False) diff --git a/examples/trl/ppo.py b/examples/trl/ppo.py deleted file mode 100644 index d4ad127641..0000000000 --- a/examples/trl/ppo.py +++ /dev/null @@ -1,312 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/rl_training.py, enable it for Gaudi2 -from dataclasses import dataclass, field -from typing import List, Optional - -import torch -from datasets import load_dataset -from peft import LoraConfig -from tqdm import tqdm -from transformers import Adafactor, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser, pipeline -from trl import AutoModelForCausalLMWithValueHead -from trl.core import LengthSampler - -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.trl import GaudiPPOConfig, GaudiPPOTrainer, adapt_PreTrainedModelWrapper_to_gaudi -from optimum.habana.utils import set_seed - - -tqdm.pandas() - - -@dataclass -class ScriptArguments: - """ - The name of the Casual LM model we wish to fine with PPO - """ - - # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode - # models like gpt-neo* models are more suitable. 
- model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"}) - tokenizer_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the tokenizer name"}) - reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"}) - log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"}) - learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"}) - output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum output length for generation"}) - input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum input length for generation"}) - mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"}) - batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"}) - ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"}) - gradient_accumulation_steps: Optional[int] = field( - default=4, metadata={"help": "the number of gradient accumulation steps"} - ) - adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"}) - early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"}) - target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"}) - reward_baseline: Optional[float] = field( - default=0.0, - metadata={"help": "a baseline value that is subtracted from the reward"}, - ) - batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"}) - save_freq: Optional[int] = field(default=None, metadata={"help": "n steps to save the model"}) - output_dir: Optional[str] = field(default="runs/", metadata={"help": "n steps to save the model"}) - seed: Optional[int] = field(default=0, metadata={"help": "the seed"}) - steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"}) - init_kl_coef: Optional[float] = field( - default=0.2, - metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"}, - ) - - adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}) - use_habana: Optional[bool] = field(default=True, metadata={"help": "use habana for RL training"}) - lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"}) - lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) - lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"}) - lora_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the LoRA method."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." 
- ) - }, - ) - - -adapt_PreTrainedModelWrapper_to_gaudi() -parser = HfArgumentParser(ScriptArguments) -script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0] -reward_model_name = script_args.reward_model_name -dataset_name = "lvwerra/stack-exchange-paired" -config = GaudiPPOConfig( - steps=script_args.steps, - model_name=script_args.model_name, - learning_rate=script_args.learning_rate, - log_with=script_args.log_with, - batch_size=script_args.batch_size, - mini_batch_size=script_args.mini_batch_size, - gradient_accumulation_steps=script_args.gradient_accumulation_steps, - optimize_cuda_cache=True, - early_stopping=script_args.early_stopping, - target_kl=script_args.target_kl, - ppo_epochs=script_args.ppo_epochs, - seed=script_args.seed, - init_kl_coef=script_args.init_kl_coef, - adap_kl_ctrl=script_args.adap_kl_ctrl, - use_habana=script_args.use_habana, - pad_max_len=script_args.input_max_length + script_args.output_max_length, - pad_max_input_len=script_args.input_max_length, -) - -train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train") -if script_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), script_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) -original_columns = train_dataset.column_names - -# We then define the arguments to pass to the sentiment analysis pipeline. -# We set `return_all_scores` to True to get the sentiment score for each token. -sent_kwargs = { - "return_all_scores": True, - "function_to_apply": "none", - "batch_size": 16, - "truncation": True, -} -if config.pad_for_acceleration: - sent_kwargs["padding"] = "max_length" - sent_kwargs["max_length"] = script_args.input_max_length + script_args.output_max_length - -tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name) -# GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token. -# only for this model. - -if getattr(tokenizer, "pad_token", None) is None: - tokenizer.pad_token = tokenizer.eos_token - - -# Below is an example function to build the dataset. In our case, we use the IMDB dataset -# from the `datasets` library. One should customize this function to train the model on -# its own dataset. -def build_dataset( - tokenizer, - dataset_name="lvwerra/stack-exchange-paired", -): - """ - Build dataset for training. This builds the dataset from `load_dataset`, one should - customize this function to train the model on its own dataset. - - Args: - dataset_name (`str`): - The name of the dataset to be loaded. - - Returns: - dataloader (`torch.utils.data.DataLoader`): - The dataloader for the dataset. - """ - - num_proc = 24 - - def preprocess_function(examples): - new_examples = { - "query": [], - "input_ids": [], - } - for question in examples["question"]: - query = "Question: " + question + "\n\nAnswer: " - tokenized_question = tokenizer(query, truncation=True) - new_examples["query"].append(query) - new_examples["input_ids"].append(tokenized_question["input_ids"]) - - return new_examples - - ds = train_dataset.map( - preprocess_function, - batched=True, - num_proc=num_proc, - remove_columns=original_columns, - ) - ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False) - - ds.set_format(type="torch") - return ds - - -# We retrieve the dataloader by calling the `build_dataset` function. 
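# Descriptive note on the data produced above: each example keeps the raw `query` string
# ("Question: <question>\n\nAnswer: "), which is later concatenated with the generated
# response for reward scoring and logging, and the tokenized `input_ids` that
# `ppo_trainer.generate` consumes in the training loop below.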
-dataset = build_dataset(tokenizer) - - -def collator(data): - return {key: [d[key] for d in data] for key in data[0]} - - -# set seed before initializing value head for deterministic eval -set_seed(config.seed) - -# Now let's build the model, the reference model, and the tokenizer. -current_device = GaudiAccelerator().local_process_index -lora_config = LoraConfig( - r=script_args.lora_r, - lora_alpha=script_args.lora_alpha, - lora_dropout=script_args.lora_dropout, - target_modules=script_args.lora_target_modules, - bias="none", - task_type="CAUSAL_LM", -) -model = AutoModelForCausalLMWithValueHead.from_pretrained( - config.model_name, - peft_config=lora_config, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, -) -model.config.use_fused_rope = False -model.config.use_fused_rms_norm = False -optimizer = None -model = model.to(torch.bfloat16) - -if script_args.use_habana: - ref_model = AutoModelForCausalLMWithValueHead.from_pretrained( - config.model_name, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - ) -else: - ref_model = None -if script_args.adafactor: - optimizer = Adafactor( - filter(lambda p: p.requires_grad, model.parameters()), - scale_parameter=False, - relative_step=False, - warmup_init=False, - lr=config.learning_rate, - ) -# We then build the PPOTrainer, passing the model, the reference model, the tokenizer -ppo_trainer = GaudiPPOTrainer( - config, - model, - ref_model=ref_model, - tokenizer=tokenizer, - dataset=dataset, - data_collator=collator, - optimizer=optimizer, -) - -# We then build the sentiment analysis pipeline using our reward model, passing the -# model name and the sentiment analysis pipeline arguments. Let's also make sure to -# set the device to the same device as the PPOTrainer. -device = ppo_trainer.accelerator.device - -reward_model = AutoModelForSequenceClassification.from_pretrained( - reward_model_name, - num_labels=1, - low_cpu_mem_usage=True, -) - -if config.use_habana: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - reward_model = wrap_in_hpu_graph(reward_model) - -if device.type == "hpu": - device = "hpu" - -sentiment_pipe = pipeline( - "sentiment-analysis", - model=reward_model, - tokenizer=tokenizer, - return_token_type_ids=False, - device=device, - model_kwargs={ - "low_cpu_mem_usage": True, - "torch_dtype": torch.bfloat16, - }, -) - -if sentiment_pipe.model.config.pad_token_id is None: - sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id -# We then define the arguments to pass to the `generate` function. These arguments -# are passed to the `generate` function of the PPOTrainer, which is a wrapper around -# the `generate` function of the trained model. 
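# A short note on the sampling settings below: with `top_k` at 0.0 and `top_p` at 1.0 the
# logits are not truncated, so responses are drawn by pure sampling; `eos_token_id` appears
# to be set to an id outside the model's vocabulary so that generation is not stopped early
# and the response length is governed by `output_length_sampler` (or by the padded maximum
# length when padding for acceleration is enabled).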
-generation_kwargs = { - # "min_length": -1, - "top_k": 0.0, - "top_p": 1.0, - "do_sample": True, - "pad_token_id": tokenizer.pad_token_id, - "eos_token_id": 100_000, -} -output_min_length = 32 -output_max_length = script_args.output_max_length -if not config.pad_for_acceleration: - output_length_sampler = LengthSampler(output_min_length, output_max_length) -else: - output_length_sampler = LengthSampler(output_max_length, output_max_length + 1) -for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)): - if epoch >= config.total_ppo_epochs: - break - - question_tensors = batch["input_ids"] - - response_tensors = ppo_trainer.generate( - question_tensors, - return_prompt=False, - length_sampler=output_length_sampler, - **generation_kwargs, - ) - batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True) - - # Compute reward score (using the sentiment analysis pipeline) - texts = [q + r for q, r in zip(batch["query"], batch["response"])] - pipe_outputs = sentiment_pipe(texts, **sent_kwargs) - rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs] - - # Run PPO step - stats = ppo_trainer.step(question_tensors, response_tensors, rewards) - ppo_trainer.log_stats(stats, batch, rewards) - - if script_args.save_freq and epoch and epoch % script_args.save_freq == 0: - ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}") - -ppo_trainer.save_pretrained(script_args.output_dir) diff --git a/examples/trl/requirements.txt b/examples/trl/requirements.txt deleted file mode 100644 index 37651ba1b5..0000000000 --- a/examples/trl/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -trl == 0.8.6 -peft == 0.6.2 -datasets == 2.19.2 -wandb == 0.17.1 -tyro == 0.8.4 -evaluate == 0.4.2 -scikit-learn == 1.5.0 diff --git a/examples/trl/reward_modeling.py b/examples/trl/reward_modeling.py deleted file mode 100644 index f67d657944..0000000000 --- a/examples/trl/reward_modeling.py +++ /dev/null @@ -1,274 +0,0 @@ -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py, enable it for Gaudi2 - -from dataclasses import dataclass, field -from typing import List, Optional - -import evaluate -import numpy as np -import torch -from datasets import load_dataset -from peft import LoraConfig, TaskType, get_peft_model -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - HfArgumentParser, - TrainerCallback, -) - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.trl import GaudiRewardTrainer, RewardDataCollatorWithPadding -from optimum.habana.utils import set_seed - - -# Define and parse arguments. -@dataclass -class ScriptArguments: - """ - These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train. - """ - - local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"}) - resume_from_checkpoint: Optional[bool] = field( - default=False, - metadata={"help": "If you want to resume training where it left off."}, - ) - deepspeed: Optional[str] = field( - default=None, - metadata={ - "help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU." 
- }, - ) - per_device_train_batch_size: Optional[int] = field(default=4) - per_device_eval_batch_size: Optional[int] = field(default=1) - gradient_accumulation_steps: Optional[int] = field(default=1) - learning_rate: Optional[float] = field(default=2e-5) - weight_decay: Optional[float] = field(default=0.001) - model_name: Optional[str] = field( - default="meta-llama/Llama-2-7b-hf", - metadata={ - "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." - }, - ) - tokenizer_name: Optional[str] = field( - default="meta-llama/Llama-2-7b-hf", - metadata={ - "help": "The tokenizer for your model, if left empty will use the default for your model", - }, - ) - bf16: Optional[bool] = field( - default=True, - metadata={ - "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU." - }, - ) - num_train_epochs: Optional[int] = field( - default=1, - metadata={"help": "The number of training epochs for the reward model."}, - ) - train_subset: Optional[int] = field( - default=100000, - metadata={"help": "The size of the subset of the training data to use"}, - ) - eval_subset: Optional[int] = field( - default=50000, - metadata={"help": "The size of the subset of the eval data to use"}, - ) - gradient_checkpointing: Optional[bool] = field( - default=False, - metadata={"help": "Enables gradient checkpointing."}, - ) - optim: Optional[str] = field( - default="adamw_hf", - metadata={"help": "The optimizer to use."}, - ) - lr_scheduler_type: Optional[str] = field( - default="linear", - metadata={"help": "The lr scheduler"}, - ) - max_length: Optional[int] = field(default=512) - eval_first_step: Optional[bool] = field( - default=False, - metadata={"help": "Whether to run eval after the first step"}, - ) - output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"}) - save_steps: Optional[int] = field(default=500, metadata={"help": "the saving frequency"}) - eval_steps: Optional[int] = field(default=500, metadata={"help": "the evaluation frequency"}) - logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"}) - lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"}) - lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "the lora dropout parameter"}) - lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) - lora_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the LoRA method."}, - ) - seed: Optional[int] = field( - default=0, metadata={"help": "Random seed that will be set at the beginning of training."} - ) - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." - ) - }, - ) - - -parser = HfArgumentParser(ScriptArguments) -script_args = parser.parse_args_into_dataclasses()[0] -set_seed(script_args.seed) -# Load the human stack-exchange-paired dataset for tuning the reward model. 
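# Each example in this dataset pairs a question with two candidate answers: `response_j`
# (the answer labelled as preferred) and `response_k` (the rejected one). The reward model
# trained below is expected to assign the higher score to the preferred answer.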
-train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/reward", split="train") -if script_args.train_subset > 0: - train_dataset = train_dataset.select(range(script_args.train_subset)) -eval_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/evaluation", split="train") -if script_args.eval_subset > 0: - eval_dataset = eval_dataset.select(range(script_args.eval_subset)) -# Define the training args. Needs to be done before the model is loaded if you are using deepspeed. - -training_args = GaudiTrainingArguments( - output_dir=script_args.output_dir, - learning_rate=script_args.learning_rate, - per_device_train_batch_size=script_args.per_device_train_batch_size, - per_device_eval_batch_size=script_args.per_device_eval_batch_size, - num_train_epochs=script_args.num_train_epochs, - weight_decay=script_args.weight_decay, - evaluation_strategy="steps", - eval_steps=script_args.eval_steps, - save_strategy="steps", - save_steps=script_args.save_steps, - gradient_accumulation_steps=script_args.gradient_accumulation_steps, - gradient_checkpointing=script_args.gradient_checkpointing, - deepspeed=script_args.deepspeed, - local_rank=script_args.local_rank, - remove_unused_columns=False, - label_names=[], - bf16=script_args.bf16, - logging_strategy="steps", - logging_steps=script_args.logging_steps, - optim=script_args.optim, - lr_scheduler_type=script_args.lr_scheduler_type, - report_to="none", - use_habana=True, - use_lazy_mode=True, - seed=script_args.seed, -) - -# Load the value-head model and tokenizer. -tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name -tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=script_args.token) -tokenizer.pad_token = tokenizer.eos_token - -peft_config = LoraConfig( - task_type=TaskType.SEQ_CLS, - inference_mode=False, - r=script_args.lora_r, - lora_alpha=script_args.lora_alpha, - lora_dropout=script_args.lora_dropout, - target_modules=script_args.lora_target_modules, - bias="none", -) -torch.autograd.set_detect_anomaly(True) -model = AutoModelForSequenceClassification.from_pretrained( - script_args.model_name, num_labels=1, torch_dtype=torch.bfloat16 -) - -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() - -# Need to do this for gpt2, because it doesn't have an official pad token. -tokenizer.pad_token = tokenizer.eos_token -model.config.pad_token_id = tokenizer.eos_token_id -model.config.use_cache = not script_args.gradient_checkpointing -model.config.use_fused_rope = False -num_proc = 24 # Can adjust to be higher if you have more processors. -original_columns = train_dataset.column_names - - -# Turn the dataset into pairs of post + summaries, where text_j is the preferred question + answer and text_k is the other. -# Then tokenize the dataset. 
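# Each pair is therefore tokenized twice, once with the preferred answer (suffix `_j`) and
# once with the rejected one (suffix `_k`); `RewardDataCollatorWithPadding` further down
# pads both sequences to `max_length` so they can be batched and scored by the same model.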
-def preprocess_function(examples): - new_examples = { - "input_ids_j": [], - "attention_mask_j": [], - "input_ids_k": [], - "attention_mask_k": [], - } - for question, response_j, response_k in zip(examples["question"], examples["response_j"], examples["response_k"]): - tokenized_j = tokenizer("Question: " + question + "\n\nAnswer: " + response_j, truncation=True) - tokenized_k = tokenizer("Question: " + question + "\n\nAnswer: " + response_k, truncation=True) - - new_examples["input_ids_j"].append(tokenized_j["input_ids"]) - new_examples["attention_mask_j"].append(tokenized_j["attention_mask"]) - new_examples["input_ids_k"].append(tokenized_k["input_ids"]) - new_examples["attention_mask_k"].append(tokenized_k["attention_mask"]) - - return new_examples - - -# preprocess the dataset and filter out QAs that are longer than script_args.max_length -train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=num_proc, - remove_columns=original_columns, -) -train_dataset = train_dataset.filter( - lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length -) - -eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=num_proc, - remove_columns=original_columns, -) -eval_dataset = eval_dataset.filter( - lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length -) - -# Define the metric that we'll use for validation. -accuracy = evaluate.load("accuracy") - - -def compute_metrics(eval_pred): - predictions, _ = eval_pred - # Here, predictions is rewards_j and rewards_k. - # We want to see how much of the time rewards_j > rewards_k. - predictions = np.argmax(predictions, axis=0) - labels = np.zeros(predictions.shape) - return accuracy.compute(predictions=predictions, references=labels) - - -gaudi_config = GaudiConfig() -gaudi_config.use_fused_adam = True -gaudi_config.use_fused_clip_norm = True - -# Train the model, woohoo. 
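# A note on the objective (not spelled out in this script): the underlying TRL reward
# trainer optimizes a pairwise ranking loss, roughly loss = -log(sigmoid(r_j - r_k)),
# which pushes the score of the preferred answer above the score of the rejected one.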
-trainer = GaudiRewardTrainer( - model=model, - gaudi_config=gaudi_config, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - compute_metrics=compute_metrics, - data_collator=RewardDataCollatorWithPadding( - tokenizer=tokenizer, max_length=script_args.max_length, padding="max_length" - ), -) - - -if script_args.eval_first_step: - - class EvaluateFirstStepCallback(TrainerCallback): - def on_step_end(self, args, state, control, **kwargs): - if state.global_step == 1: - control.should_evaluate = True - - trainer.add_callback(EvaluateFirstStepCallback()) - -trainer.train(script_args.resume_from_checkpoint) - -print("Saving last checkpoint of the model") -trainer.save_model(script_args.output_dir) diff --git a/examples/trl/sft.py b/examples/trl/sft.py deleted file mode 100644 index 6edaa43308..0000000000 --- a/examples/trl/sft.py +++ /dev/null @@ -1,186 +0,0 @@ -# Fine-Tune Llama2-7b on SE paired dataset -# copy from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama_2/scripts/sft_llama2.py, enable it for Gaudi2 -import logging -from dataclasses import dataclass, field -from typing import List, Optional - -import torch -import transformers -from datasets import load_dataset -from peft import LoraConfig -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser -from transformers.integrations.deepspeed import ( - is_deepspeed_available, -) -from trl.trainer import ConstantLengthDataset - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.trl import GaudiSFTTrainer -from optimum.habana.utils import set_seed - - -logger = logging.getLogger(__name__) - - -@dataclass -class ScriptArguments: - model_name_or_path: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"}) - dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"}) - subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"}) - split: Optional[str] = field(default="train", metadata={"help": "the split to use"}) - size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"}) - streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"}) - shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"}) - seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"}) - num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"}) - packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"}) - - # LoraConfig - lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"}) - lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"}) - lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"}) - lora_target_modules: List[str] = field( - default_factory=lambda: None, - metadata={"help": "Target modules for the LoRA method."}, - ) - - token: str = field( - default=None, - metadata={ - "help": ( - "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
- ) - }, - ) - - -parser = HfArgumentParser((ScriptArguments, GaudiTrainingArguments)) -script_args, training_args = parser.parse_args_into_dataclasses() -peft_config = LoraConfig( - r=script_args.lora_r, - lora_alpha=script_args.lora_alpha, - lora_dropout=script_args.lora_dropout, - target_modules=script_args.lora_target_modules, - bias="none", - task_type="CAUSAL_LM", -) - -if training_args.group_by_length and script_args.packing: - raise ValueError("Cannot use both packing and group by length") - -set_seed(training_args.seed) - - -def chars_token_ratio(dataset, tokenizer, nb_examples=400): - """ - Estimate the average number of characters per token in the dataset. - """ - total_characters, total_tokens = 0, 0 - for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): - text = prepare_sample_text(example) - total_characters += len(text) - if tokenizer.is_fast: - total_tokens += len(tokenizer(text).tokens()) - else: - total_tokens += len(tokenizer.tokenize(text)) - - return total_characters / total_tokens - - -def prepare_sample_text(example): - """Prepare the text from a sample of the dataset.""" - text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}" - return text - - -def create_datasets(tokenizer, args, seed=None): - dataset = load_dataset( - args.dataset_name, - data_dir=args.subset, - split=args.split, - token=script_args.token, - num_proc=args.num_workers if not args.streaming else None, - streaming=args.streaming, - ) - if args.streaming: - print("Loading the dataset in streaming mode") - valid_data = dataset.take(args.size_valid_set) - train_data = dataset.skip(args.size_valid_set) - train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=seed) - else: - dataset = dataset.train_test_split(test_size=0.005, seed=seed) - train_data = dataset["train"] - valid_data = dataset["test"] - print(f"Size of the train set: {len(train_data)}. 
Size of the validation set: {len(valid_data)}") - - chars_per_token = chars_token_ratio(train_data, tokenizer) - print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") - - train_dataset = ConstantLengthDataset( - tokenizer, - train_data, - formatting_func=prepare_sample_text, - infinite=True, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - valid_dataset = ConstantLengthDataset( - tokenizer, - valid_data, - formatting_func=prepare_sample_text, - infinite=False, - seq_length=args.seq_length, - chars_per_token=chars_per_token, - ) - return train_dataset, valid_dataset - - -low_cpu_mem_usage = True -if is_deepspeed_available(): - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled - - if is_deepspeed_zero3_enabled(): - low_cpu_mem_usage = False - -base_model = AutoModelForCausalLM.from_pretrained( - script_args.model_name_or_path, - low_cpu_mem_usage=low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - token=script_args.token, -) -base_model.config.use_cache = False -base_model.config.use_fused_rope = False - -tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path, trust_remote_code=True) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training - -log_level = training_args.get_process_log_level() -logger.setLevel(log_level) -transformers.utils.logging.set_verbosity(log_level) -transformers.utils.logging.enable_default_handler() -transformers.utils.logging.enable_explicit_format() - -train_dataset, eval_dataset = create_datasets(tokenizer, script_args, seed=training_args.seed) - -gaudi_config = GaudiConfig() -gaudi_config.use_fused_adam = True -gaudi_config.use_fused_clip_norm = True -trainer = GaudiSFTTrainer( - model=base_model, - gaudi_config=gaudi_config, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - peft_config=peft_config, - packing=script_args.packing, - max_seq_length=None, - tokenizer=tokenizer, - args=training_args, -) -train_result = trainer.train() -trainer.save_model(training_args.output_dir) -metrics = train_result.metrics -trainer.log_metrics("train", metrics) -trainer.save_metrics("train", metrics) diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md deleted file mode 100644 index efbe1f2a92..0000000000 --- a/examples/visual-question-answering/README.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# Visual Question Answering Examples - -This directory contains a script that showcases how to use the Transformers pipeline API to run visual question answering task on HPUs. - -## Single-HPU inference - -```bash -python3 run_pipeline.py \ - --model_name_or_path Salesforce/blip-vqa-capfilt-large \ - --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \ - --question "how many dogs are in the picture?" 
\ - --use_hpu_graphs \ - --bf16 -``` - -Models that have been validated: - - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) - - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) - - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large) \ No newline at end of file diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py deleted file mode 100644 index 7b4e817bb7..0000000000 --- a/examples/visual-question-answering/run_pipeline.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -import argparse -import logging -import time - -import PIL.Image -import requests -import torch -from transformers import pipeline - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - help="Path to pre-trained model", - ) - parser.add_argument( - "--image_path", - default=None, - type=str, - nargs="*", - help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")', - ) - parser.add_argument( - "--topk", - default=1, - type=int, - help="topk num", - ) - parser.add_argument( - "--question", - default=None, - type=str, - nargs="*", - help='question as input. Can be a single string (eg: --question "Q1"), or a list of space-separated strings (eg: --question "Q1" "Q2")', - ) - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not. 
Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to perform in bf16 precision.", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - args = parser.parse_args() - - adapt_transformers_to_gaudi() - image_paths = args.image_path - image_paths_len = len(image_paths) - - if args.batch_size > image_paths_len: - # Dynamically extends to support larger batch sizes - num_path_to_add = args.batch_size - image_paths_len - for i in range(num_path_to_add): - image_paths.append(image_paths[i % image_paths_len]) - elif args.batch_size < image_paths_len: - image_paths = image_paths[: args.batch_size] - - questions = args.question - questions_len = len(questions) - if args.batch_size > questions_len: - # Dynamically extends to support larger batch sizes - num_question_to_add = args.batch_size - questions_len - for i in range(num_question_to_add): - questions.append(questions[i % questions_len]) - elif args.batch_size < questions_len: - questions = questions[: args.batch_size] - - images = [] - - for image_path in image_paths: - images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw).convert("RGB")) - - if args.bf16: - model_dtype = torch.bfloat16 - else: - model_dtype = torch.float32 - - generator = pipeline( - "visual-question-answering", - model=args.model_name_or_path, - torch_dtype=model_dtype, - device="hpu", - ) - if args.use_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - generator.model = wrap_in_hpu_graph(generator.model) - - autocast_enable = model_dtype == torch.bfloat16 - model_input = [] - for i in range(args.batch_size): - model_input.append({"image": images[i], "question": questions[i]}) - - # warm up - for i in range(args.warmup): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): - generator(model_input, batch_size=args.batch_size, topk=args.topk) - - start = time.time() - for i in range(args.n_iterations): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): - result = generator(model_input, batch_size=args.batch_size, topk=args.topk) - end = time.time() - logger.info(f"result = {result}, time = {(end-start) * 1000/args.n_iterations}ms") - - -if __name__ == "__main__": - main() diff --git a/examples/zero-shot-object-detection/README.md b/examples/zero-shot-object-detection/README.md deleted file mode 100644 index eea67a8ce8..0000000000 --- a/examples/zero-shot-object-detection/README.md +++ /dev/null @@ -1,33 +0,0 @@ - - -# Zero Shot Object Detection Example - -This folder contains an example script which demonstrates the usage of OWL-ViT to run zero shot object detection task on Gaudi platform. 
- -## Single-HPU inference - -```bash -python3 run_example.py \ - --model_name_or_path google/owlvit-base-patch32 \ - --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \ - --prompt "a photo of a cat, a photo of a dog" \ - --use_hpu_graphs \ - --bf16 -``` - -Models that have been validated: - - [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) \ No newline at end of file diff --git a/examples/zero-shot-object-detection/run_example.py b/examples/zero-shot-object-detection/run_example.py deleted file mode 100644 index 06cd95daa8..0000000000 --- a/examples/zero-shot-object-detection/run_example.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# Copied from https://huggingface.co/docs/transformers/model_doc/owlvit - -import argparse -import time - -import habana_frameworks.torch as ht -import requests -import torch -from PIL import Image -from transformers import AutoProcessor, OwlViTForObjectDetection - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default="google/owlvit-base-patch32", - type=str, - help="Path of the pre-trained model", - ) - parser.add_argument( - "--image_path", - default="http://images.cocodataset.org/val2017/000000039769.jpg", - type=str, - help='Path of the input image. Should be a single string (eg: --image_path "URL")', - ) - parser.add_argument( - "--prompt", - default="a photo of a cat, a photo of a dog", - type=str, - help='Prompt for classification. It should be a comma-separated string. (eg: --prompt "a photo of a cat, a photo of a dog")', - ) - parser.add_argument( - "--use_hpu_graphs", - action="store_true", - help="Whether to use HPU graphs or not.
Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--bf16", - action="store_true", - help="Whether to use bf16 precision for classification.", - ) - parser.add_argument( - "--print_result", - action="store_true", - help="Whether to print the classification results.", - ) - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - - args = parser.parse_args() - - adapt_transformers_to_gaudi() - - processor = AutoProcessor.from_pretrained(args.model_name_or_path) - model = OwlViTForObjectDetection.from_pretrained(args.model_name_or_path) - - image = Image.open(requests.get(args.image_path, stream=True).raw) - texts = [] - for text in args.prompt.split(","): - texts.append(text) - - if args.use_hpu_graphs: - model = ht.hpu.wrap_in_hpu_graph(model) - - autocast = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16) - model.to("hpu") - - with torch.no_grad(), autocast: - for i in range(args.warmup): - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - outputs = model(**inputs) - torch.hpu.synchronize() - - total_model_time = 0 - for i in range(args.n_iterations): - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - model_start_time = time.time() - outputs = model(**inputs) - torch.hpu.synchronize() - model_end_time = time.time() - total_model_time = total_model_time + (model_end_time - model_start_time) - - if args.print_result: - # Target image sizes (height, width) to rescale box predictions [batch_size, 2] - target_sizes = torch.Tensor([image.size[::-1]]) - - # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) - results = processor.post_process_object_detection( - outputs=outputs, target_sizes=target_sizes, threshold=0.1 - ) - if i == 0: - boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] - for box, score, label in zip(boxes, scores, labels): - box = [round(i, 2) for i in box.tolist()] - print(f"Detected {texts[label]} with confidence {round(score.item(), 3)} at location {box}") - - print("n_iterations: " + str(args.n_iterations)) - print("Total latency (ms): " + str(total_model_time * 1000)) - print("Average latency (ms): " + str(total_model_time * 1000 / args.n_iterations)) diff --git a/notebooks/AI_HW_Summit_2022.ipynb b/notebooks/AI_HW_Summit_2022.ipynb deleted file mode 100644 index 4db1b73583..0000000000 --- a/notebooks/AI_HW_Summit_2022.ipynb +++ /dev/null @@ -1,442 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "455917eb", - "metadata": {}, - "source": [ - "![](../readme_logo.png)" - ] - }, - { - "cell_type": "markdown", - "id": "d1356f1f", - "metadata": {}, - "source": [ - "# Fine-tuning GPT2-XL with 🤗 Optimum Habana\n", - "\n", - "This notebook shows how to fine-tune GPT2-XL for causal language modeling with Optimum Habana. You can find more information in the [documentation](https://huggingface.co/docs/optimum/habana/index) and in the [package repository](https://github.com/huggingface/optimum-habana).\n", - "\n", - "Any other model that has been validated for language modeling (see [here](https://huggingface.co/docs/optimum/habana/index)) can be used, like BERT or RoBERTa." 
- ] - }, - { - "cell_type": "markdown", - "id": "d9ecd62a", - "metadata": {}, - "source": [ - "## What is Causal Language Modeling?\n", - "\n", - "Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the model **only attends to the left context** (tokens on the left of the mask). Such a training is particularly interesting for generation tasks.\n", - "\n", - "Here is an example of inputs that could be used for causal language modeling:\n", - "\n", - "> This live AI webinar is organized by Habana Labs and Hugging Face and" - ] - }, - { - "cell_type": "markdown", - "id": "d7a1ee47", - "metadata": {}, - "source": [ - "## Training Script\n", - "\n", - "We are going to use the `run_clm.py` example script that you can find [here](https://github.com/huggingface/optimum-habana/blob/main/examples/language-modeling/run_clm.py). It performs the following:\n", - "- download and preprocess the dataset,\n", - "- instantiate the model by downloading a pre-trained checkpoint or initializing a new one,\n", - "- download a tokenizer,\n", - "- model training\n", - "- model evaluation\n", - "\n", - "It enables to **fine-tune** or **pre-train** a model.\n", - "\n", - "> The only difference with the `run_clm.py` example script of Transformers is that the `Trainer` and the `TrainingArguments` classes have been replaced by `GaudiTrainer` and `GaudiTrainingArguments` respectively." - ] - }, - { - "cell_type": "markdown", - "id": "c82cf40a", - "metadata": {}, - "source": [ - "## Dataset\n", - "\n", - "The **WikiText** language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.\n", - "\n", - "It is available on the Hugging Face Hub and you cand find more information about it [here](https://huggingface.co/datasets/wikitext)." - ] - }, - { - "cell_type": "markdown", - "id": "4022996f", - "metadata": {}, - "source": [ - "## 1. Install Dependencies\n", - "\n", - "We first install the latest version of Optimum Habana:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5edf1a81", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install optimum-habana" - ] - }, - { - "cell_type": "markdown", - "id": "d72dd346", - "metadata": {}, - "source": [ - "Let's also install the required libraries to run this example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b32c8253", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install datasets sentencepiece protobuf scikit-learn evaluate" - ] - }, - { - "cell_type": "markdown", - "id": "685c94c7", - "metadata": {}, - "source": [ - "## 2. Fine-tuning GPT2-XL on 8 HPUs\n", - "\n", - "### Training Arguments\n", - "\n", - "Let's specify the training arguments the same way as in Transformers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d97d4e32", - "metadata": {}, - "outputs": [], - "source": [ - "training_args = {\n", - " \"output_dir\": \"/tmp/clm_gpt2_xl\",\n", - " \"dataset_name\": \"wikitext\",\n", - " \"dataset_config_name\": \"wikitext-2-raw-v1\",\n", - " \"num_train_epochs\": 1,\n", - " \"per_device_train_batch_size\": 4,\n", - " \"per_device_eval_batch_size\": 4,\n", - " \"gradient_checkpointing\": True,\n", - " \"do_train\": True,\n", - " \"do_eval\": True,\n", - " \"overwrite_output_dir\": True,\n", - " \"use_cache\": False,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "b91f0132", - "metadata": {}, - "source": [ - "Decide below whether you want to run pre-training or fine-tuning:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5385bd6", - "metadata": {}, - "outputs": [], - "source": [ - "pretraining = False\n", - "model_name = \"gpt2-xl\"\n", - "\n", - "if pretraining:\n", - " training_args[\"config_name\"] = model_name\n", - " training_args[\"tokenizer_name\"] = model_name\n", - "else:\n", - " training_args[\"model_name_or_path\"] = model_name" - ] - }, - { - "cell_type": "markdown", - "id": "6b7218bb", - "metadata": {}, - "source": [ - "And finally the Gaudi-related arguments:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2c19e02", - "metadata": {}, - "outputs": [], - "source": [ - "training_args[\"use_habana\"] = True # Whether to use HPUs or not\n", - "training_args[\"use_lazy_mode\"] = True # Whether to use lazy or eager mode\n", - "training_args[\"gaudi_config_name\"] = \"Habana/gpt2\" # Gaudi configuration to use\n", - "training_args[\"throughput_warmup_steps\"] = 3 # Remove the first N training iterations from throughput computation" - ] - }, - { - "cell_type": "markdown", - "id": "3a2d9bba", - "metadata": {}, - "source": [ - "All the existing Gaudi configurations are [here](https://huggingface.co/habana). You can also create your own Gaudi configuration and upload it to the Hugging Face Hub!" - ] - }, - { - "cell_type": "markdown", - "id": "1bb8b26b", - "metadata": {}, - "source": [ - "### Running the Script\n", - "\n", - "We are going to leverage the `DistributedRunner` class to launch a distributed training. This could also be done with the [`gaudi_spawn.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/gaudi_spawn.py) script. More information [here](https://huggingface.co/docs/optimum/habana/usage_guides/distributed).\n", - "\n", - "To be initialized, an instance of this class requires the command to execute and the number of devices to use. 
Since one Gaudi has 8 HPUs, we are going to use all of them.\n", - "\n", - "> **Disclaimer: the run below will fail!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "feaba9f9", - "metadata": {}, - "outputs": [], - "source": [ - "from optimum.habana.distributed import DistributedRunner\n", - "\n", - "# Build the command to execute\n", - "training_args_command_line = \" \".join(f\"--{key} {value}\" for key, value in training_args.items())\n", - "command = f\"../examples/language-modeling/run_clm.py {training_args_command_line}\"\n", - "\n", - "# # Instantiate a distributed runner\n", - "# distributed_runner = DistributedRunner(\n", - "# command_list=[command], # The command(s) to execute\n", - "# world_size=8, # The number of HPUs\n", - "# use_mpi=True, # OpenMPI is used for multi-processing\n", - "# )\n", - "\n", - "# # Launch training\n", - "# ret_code = distributed_runner.run()" - ] - }, - { - "cell_type": "markdown", - "id": "db6f1f75", - "metadata": {}, - "source": [ - "This run failed because it was too big to fit in HPUs memory... Let's use DeepSpeed to solve this!" - ] - }, - { - "cell_type": "markdown", - "id": "02862b85", - "metadata": {}, - "source": [ - "## 3. DeepSpeed for HPUs\n", - "\n", - "It is possible to use DeepSpeed with HPUs to train larger models! This will enable to spread the optimizer states and gradients accross processes to use less memory.\n", - "\n", - "How to switch to distributed training with DeepSpeed:\n", - "1. Install Habana DeepSpeed.\n", - "2. Add one training argument to specify the DeepSpeed configuration to use.\n", - "3. Instantiate a new distributed runner.\n", - "\n", - "Let's install Habana DeepSpeed:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a153780", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0" - ] - }, - { - "cell_type": "markdown", - "id": "08dcd80f", - "metadata": {}, - "source": [ - "We need a DeepSpeed configuration. We are going to use [this one](https://github.com/huggingface/optimum-habana/tree/main/notebooks/configs/deepspeed_zero_2.json)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "428b9504", - "metadata": {}, - "outputs": [], - "source": [ - "training_args[\"deepspeed\"] = \"configs/deepspeed_zero_2.json\"" - ] - }, - { - "cell_type": "markdown", - "id": "28aa0fd6", - "metadata": {}, - "source": [ - "We now have to instantiate a new distributed runner and to run it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6d116e5", - "metadata": {}, - "outputs": [], - "source": [ - "# Build the command to execute\n", - "training_args_command_line = \" \".join(f\"--{key} {value}\" for key, value in training_args.items())\n", - "command = f\"../examples/language-modeling/run_clm.py {training_args_command_line}\"\n", - "\n", - "# Instantiate a distributed runner\n", - "distributed_runner = DistributedRunner(\n", - " command_list=[command], # The command(s) to execute\n", - " world_size=8, # The number of HPUs\n", - " use_deepspeed=True, # Enable DeepSpeed\n", - ")\n", - "\n", - "# Launch training\n", - "ret_code = distributed_runner.run()" - ] - }, - { - "cell_type": "markdown", - "id": "cc1222ea", - "metadata": {}, - "source": [ - "Let's try the model we just fine-tuned!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "572f2849", - "metadata": {}, - "outputs": [], - "source": [ - "# The sequence to complete\n", - "prompt_text = \"This live AI webinar is organized by Habana Labs and Hugging Face and\"\n", - "\n", - "import torch\n", - "import habana_frameworks.torch.hpu\n", - "\n", - "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", - "\n", - "\n", - "path_to_model = training_args[\"output_dir\"] # the folder where everything related to our run was saved\n", - "\n", - "device = torch.device(\"hpu\")\n", - "\n", - "# Load the tokenizer and the model\n", - "tokenizer = GPT2Tokenizer.from_pretrained(path_to_model)\n", - "model = GPT2LMHeadModel.from_pretrained(path_to_model)\n", - "model.to(device)\n", - "\n", - "# Encode the prompt\n", - "encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors=\"pt\")\n", - "encoded_prompt = encoded_prompt.to(device)\n", - "\n", - "# Generate the following of the prompt\n", - "output_sequences = model.generate(\n", - " input_ids=encoded_prompt,\n", - " max_length=16 + len(encoded_prompt[0]),\n", - " do_sample=True,\n", - " num_return_sequences=3,\n", - ")\n", - "\n", - "# Remove the batch dimension when returning multiple sequences\n", - "if len(output_sequences.shape) > 2:\n", - " output_sequences.squeeze_()\n", - "\n", - "generated_sequences = []\n", - "\n", - "for generated_sequence_idx, generated_sequence in enumerate(output_sequences):\n", - " print(f\"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===\")\n", - " generated_sequence = generated_sequence.tolist()\n", - "\n", - " # Decode text\n", - " text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)\n", - "\n", - " # Remove all text after the stop token\n", - " text = text[: text.find(\".\")]\n", - "\n", - " # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing\n", - " total_sequence = prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]\n", - "\n", - " generated_sequences.append(total_sequence)\n", - " print(total_sequence)" - ] - }, - { - "cell_type": "markdown", - "id": "23f74a31", - "metadata": {}, - "source": [ - "And here are the costs for 3 epochs with Gaudi and with Nvidia V100:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71284cfa", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "gaudi_price_per_hour = 13.10904\n", - "v100_price_per_hour = 12.24\n", - "\n", - "print(\n", - " f\"Gaudi (dl1.24xlarge): training time = 630s, cost = {np.round(630 * gaudi_price_per_hour / 3600, 2)}$ ({gaudi_price_per_hour}$/hr)\"\n", - ")\n", - "print(\n", - " f\"4 x V100 (p3.8xlarge) : training time = 858s, cost = {np.round(858 * v100_price_per_hour / 3600, 2)}$ ({v100_price_per_hour}$/hr)\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6316f65c", - "metadata": {}, - "source": [ - "We successfully trained GPT2-XL which has 1.6 billion parameters.\n", - "You can train even bigger models with Gaudi and DeepSpeed, try it now! More information is available in [the documentation of Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed)." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/configs/deepspeed_zero_2.json b/notebooks/configs/deepspeed_zero_2.json deleted file mode 100644 index 5d5b80af99..0000000000 --- a/notebooks/configs/deepspeed_zero_2.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "steps_per_print": 1, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} diff --git a/optimum/habana/__init__.py b/optimum/habana/__init__.py deleted file mode 100644 index b936d5fbbc..0000000000 --- a/optimum/habana/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .transformers import ( - GaudiConfig, - GaudiSeq2SeqTrainer, - GaudiSeq2SeqTrainingArguments, - GaudiTrainer, - GaudiTrainingArguments, -) -from .utils import check_synapse_version -from .version import __version__ - -check_synapse_version() diff --git a/optimum/habana/accelerate/__init__.py b/optimum/habana/accelerate/__init__.py deleted file mode 100644 index 7045124d9a..0000000000 --- a/optimum/habana/accelerate/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .accelerator import GaudiAccelerator -from .state import GaudiAcceleratorState, GaudiPartialState diff --git a/optimum/habana/accelerate/accelerator.py b/optimum/habana/accelerate/accelerator.py deleted file mode 100644 index 56f16386db..0000000000 --- a/optimum/habana/accelerate/accelerator.py +++ /dev/null @@ -1,856 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import contextlib -import functools -import math -import os -import sys -import warnings -from collections import OrderedDict -from contextlib import contextmanager -from dataclasses import make_dataclass -from types import MethodType - -import torch -from accelerate import Accelerator -from accelerate.logging import get_logger -from accelerate.scheduler import AcceleratedScheduler -from accelerate.state import GradientState -from accelerate.tracking import GeneralTracker, filter_trackers -from accelerate.utils import ( - AutocastKwargs, - DeepSpeedPlugin, - DistributedDataParallelKwargs, - DistributedType, - GradientAccumulationPlugin, - GradScalerKwargs, - InitProcessGroupKwargs, - KwargsHandler, - LoggerType, - MegatronLMPlugin, - PrecisionType, - ProjectConfiguration, - RNGType, - check_os_kernel, - convert_outputs_to_fp32, - is_deepspeed_available, - is_torch_version, - parse_choice_from_env, -) -from accelerate.utils.constants import FSDP_PYTORCH_VERSION -from accelerate.utils.operations import _gpu_gather -from accelerate.utils.other import is_compiled_module -from torch.optim.lr_scheduler import LRScheduler - - -if is_deepspeed_available(): - from accelerate.utils import ( - DeepSpeedEngineWrapper, - DeepSpeedOptimizerWrapper, - DeepSpeedSchedulerWrapper, - DummyOptim, - DummyScheduler, - ) - -from .data_loader import gaudi_prepare_data_loader -from .state import GaudiAcceleratorState, GaudiPartialState -from .utils import ( - GaudiDistributedType, - GaudiDynamoBackend, - GaudiFP8RecipeKwargs, - GaudiFullyShardedDataParallelPlugin, - GaudiTorchDynamoPlugin, - convert_model, - get_fp8_recipe, -) - - -logger = get_logger(__name__) - - -class GaudiAccelerator(Accelerator): - """ - Adapted from: https://github.com/huggingface/accelerate/blob/8514c35192ac9762920f1ab052e5cea4c0e46eeb/src/accelerate/accelerator.py#L145 - """ - - def __init__( - self, - device_placement: bool = True, - split_batches: bool = False, - mixed_precision: PrecisionType | str | None = None, - gradient_accumulation_steps: int = 1, - cpu: bool = False, - deepspeed_plugin: DeepSpeedPlugin | None = None, - fsdp_plugin: GaudiFullyShardedDataParallelPlugin | None = None, - megatron_lm_plugin: MegatronLMPlugin | None = None, - rng_types: list[str | RNGType] | None = None, - log_with: str | LoggerType | GeneralTracker | list[str | LoggerType | GeneralTracker] | None = None, - project_dir: str | os.PathLike | None = None, - project_config: ProjectConfiguration | None = None, - gradient_accumulation_plugin: GradientAccumulationPlugin | None = None, - dispatch_batches: bool | None = None, - even_batches: bool = True, - use_seedable_sampler: bool = False, - step_scheduler_with_optimizer: bool = True, - kwargs_handlers: list[KwargsHandler] | None = None, - dynamo_backend: GaudiDynamoBackend | str | None = None, - distribution_strategy: str = None, - force_autocast: bool = False, - fp8_recipe_format: str = None, - ): - self.trackers = [] - if project_config is not None: - self.project_configuration = project_config - else: - self.project_configuration = ProjectConfiguration(project_dir=project_dir) - if project_dir is not None and self.project_dir is None: - self.project_configuration.set_directories(project_dir) - if mixed_precision is not None: - mixed_precision = str(mixed_precision) - if mixed_precision not in PrecisionType: - raise ValueError( - f"Unknown mixed_precision mode: {mixed_precision}. 
Choose between {PrecisionType.list()}" - ) - elif mixed_precision == "fp16": - raise ValueError("fp16 is not supported on Habana Gaudi.") - - dynamo_plugin = ( - GaudiTorchDynamoPlugin() if dynamo_backend is None else GaudiTorchDynamoPlugin(backend=dynamo_backend) - ) - - if deepspeed_plugin is None: # init from env variables - deepspeed_plugin = ( - DeepSpeedPlugin() if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true" else None - ) - else: - if not isinstance(deepspeed_plugin, DeepSpeedPlugin): - raise TypeError("`deepspeed_plugin` must be an `accelerate.utils.DeepSpeedPlugin` object.") - os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" # use DeepSpeed if plugin is provided - if deepspeed_plugin: - if not is_deepspeed_available(): - raise ImportError( - "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0`." - ) - - mixed_precision = ( - os.environ.get("ACCELERATE_MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision - ) - deepspeed_plugin.set_mixed_precision(mixed_precision) - deepspeed_plugin.set_deepspeed_weakref() - - if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" or isinstance( - fsdp_plugin, GaudiFullyShardedDataParallelPlugin - ): - import importlib.metadata - - torch_version = importlib.metadata.version("torch") - torch_version = torch_version[5:] - if is_torch_version("<", FSDP_PYTORCH_VERSION + torch_version): - raise ValueError(f"FSDP requires PyTorch >= {FSDP_PYTORCH_VERSION}") - - if fsdp_plugin is None: # init from env variables - fsdp_plugin = ( - GaudiFullyShardedDataParallelPlugin() - if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" - else None - ) - else: - if not isinstance(fsdp_plugin, GaudiFullyShardedDataParallelPlugin): - raise TypeError("`fsdp_plugin` must be a GaudiFullyShardedDataParallelPlugin object.") - os.environ["ACCELERATE_USE_FSDP"] = "true" # use FSDP if plugin is provided - - # Kwargs handlers - self.ddp_handler = None - self.scaler_handler = None - self.init_handler = None - self.fp8_recipe_handler = None - self.fp8_recipe_format = None - self.autocast_handler = None - if kwargs_handlers is not None: - for handler in kwargs_handlers: - assert isinstance( - handler, KwargsHandler - ), f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`." 
- if isinstance(handler, DistributedDataParallelKwargs): - if self.ddp_handler is not None: - raise ValueError("You can only pass one `DistributedDataParallelKwargs` in `kwargs_handler`.") - else: - self.ddp_handler = handler - elif isinstance(handler, GradScalerKwargs): - if self.scaler_handler is not None: - raise ValueError("You can only pass one `GradScalerKwargs` in `kwargs_handler`.") - else: - self.scaler_handler = handler - elif isinstance(handler, InitProcessGroupKwargs): - if self.init_handler is not None: - raise ValueError("You can only pass one `InitProcessGroupKwargs` in `kwargs_handler`.") - else: - self.init_handler = handler - elif isinstance(handler, GaudiFP8RecipeKwargs): - if self.fp8_recipe_handler is not None: - raise ValueError("You can only pass one `GaudiFP8RecipeKwargs` in `kwargs_handler`.") - else: - self.fp8_recipe_handler = handler - elif isinstance(handler, AutocastKwargs): - if self.autocast_handler is not None: - raise ValueError("You can only pass one `AutocastKwargs` in `kwargs_handler`.") - else: - self.autocast_handler = handler - - kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {} - self.state = GaudiAcceleratorState( - mixed_precision=mixed_precision, - cpu=cpu, - dynamo_plugin=dynamo_plugin, - deepspeed_plugin=deepspeed_plugin, - fsdp_plugin=fsdp_plugin, - megatron_lm_plugin=megatron_lm_plugin, - _from_accelerator=True, - **kwargs, - ) - - if self.state.is_fp8_enabled: - if self.fp8_recipe_handler is None: - self.fp8_recipe_handler = GaudiFP8RecipeKwargs() - # Handling FP8 recipe creation in init since both `prepare_model` and `_prepare_deepspeed` require it. - # (Base accelerator handles this in `prepare_model` function) - self.fp8_recipe_handler = get_fp8_recipe(self.fp8_recipe_handler) - - trackers = filter_trackers(log_with, self.logging_dir) - if len(trackers) < 1 and log_with is not None: - warnings.warn(f"`log_with={log_with}` was passed but no supported trackers are currently installed.") - self.log_with = trackers - - if ( - (mixed_precision != "bf16") - and getattr(self.state, "downcast_bfloat", False) - and (self.state.distributedType != DistributedType.TPU) - ): - raise ValueError("Can only use `downcast_bf16` when using `mixed_precision='bf16'` and on a TPU") - - if gradient_accumulation_plugin is not None: - if gradient_accumulation_steps != 1: - raise ValueError( - "You can only pass one of `gradient_accumulation_steps` and `gradient_accumulation_plugin`. Please only pass in the created `GradientAccumulationPlugin` object." 
- ) - else: - gradient_accumulation_steps = int( - parse_choice_from_env("ACCELERATE_GRADIENT_ACCUMULATION_STEPS", gradient_accumulation_steps) - ) - gradient_accumulation_plugin = GradientAccumulationPlugin(num_steps=gradient_accumulation_steps) - self.gradient_state = GradientState( - gradient_accumulation_plugin=gradient_accumulation_plugin, - ) - - self.device_placement = device_placement - self.split_batches = split_batches - self.dispatch_batches = dispatch_batches - self.even_batches = even_batches - self.use_seedable_sampler = use_seedable_sampler - self.step_scheduler_with_optimizer = step_scheduler_with_optimizer - - # Mixed precision attributes - self.scaler = None - self.native_amp = self.state.mixed_precision == "bf16" - - # Start of internal step tracking - self.step = 0 - - # Internal references to the training objects - self._optimizers = [] - self._models = [] - self._schedulers = [] - self._dataloaders = [] - self._custom_objects = [] - - # Hooks - self._load_model_state_pre_hook = OrderedDict() - self._save_model_state_pre_hook = OrderedDict() - - # RNG Types - self.rng_types = rng_types - if self.rng_types is None: - self.rng_types = ["generator"] - - # Set a flag tensor for early stopping and other breakpoints - self.flag_tensor = None - - self._distribution_strategy = distribution_strategy - - self.force_autocast = force_autocast - - check_os_kernel() - - @property - def use_fp16(self): - raise ValueError("fp16 is not supported on Habana Gaudi.") - - def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, evaluation_mode: bool = False): - """ - Prepares a PyTorch model for training in any distributed setup. It is recommended to use - [`Accelerator.prepare`] instead. - - Args: - model (`torch.nn.Module`): - A PyTorch model to prepare. You don't need to prepare a model if it is used only for inference without - any kind of mixed precision - device_placement (`bool`, *optional*): - Whether or not to place the model on the proper device. Will default to `self.device_placement`. - evaluation_mode (`bool`, *optional*, defaults to `False`): - Whether or not to set the model for evaluation only, by just applying mixed precision and - `torch.compile` (if configured in the `Accelerator` object). - - Example: - - ```python - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> # Assume a model is defined - >>> model = accelerator.prepare_model(model) - ``` - """ - if device_placement is None: - device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP - if not evaluation_mode and self.distributed_type == GaudiDistributedType.MULTI_HPU: - device_placement = None - self._models.append(model) - - # TODO: Look at enabling native TP training directly with a proper config - if ( - self.verify_device_map(model) - and self.distributed_type != DistributedType.NO - and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true" - ): - raise ValueError( - "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode." - " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`." 
- ) - - # The following block is executed only when force_autocast is True - # because forward+backward+loss is already wrapped with autocast in Trainer - if self.native_amp and self.force_autocast: - model._original_forward = model.forward - model_forward_func = model.forward.__func__ if hasattr(model.forward, "__func__") else model.forward - new_forward = torch.autocast(device_type=self.state.device.type, dtype=torch.bfloat16)(model_forward_func) - if hasattr(model.forward, "__func__"): - model.forward = MethodType(new_forward, model) - model.forward = MethodType(convert_outputs_to_fp32(model.forward.__func__), model) - else: - model.forward = convert_outputs_to_fp32(new_forward) - if self.state.is_fp8_enabled: - model = convert_model(model) - - if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr( - model, "hf_device_map", False - ): - model_devices = set(model.hf_device_map.values()) - if len(model_devices) > 1 and self.distributed_type != DistributedType.NO: - raise ValueError( - "You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode." - " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism." - " Therefore you should not specify that you are under any distributed regime in your accelerate config." - ) - current_device = list(model_devices)[0] - current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device - - if torch.device(current_device_index) != self.device: - # if on the first device (GPU 0) we don't care - if (self.device.index is not None) or (current_device_index != 0): - raise ValueError( - "You can't train a model that has been loaded in 8-bit precision on a different device than the one " - "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}" - ) - - if "cpu" in model_devices or "disk" in model_devices: - raise ValueError( - "You can't train a model that has been loaded in 8-bit precision with CPU or disk offload." 
- ) - elif device_placement and not self.verify_device_map(model): - model = model.to(self.device) - if not evaluation_mode: - if self.distributed_type == GaudiDistributedType.MULTI_HPU and self._distribution_strategy != "fast_ddp": - if any(p.requires_grad for p in model.parameters()): - kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {} - model = torch.nn.parallel.DistributedDataParallel(model, **kwargs) - elif self.distributed_type == GaudiDistributedType.FSDP: - from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP - - # Check if the model is already a FSDP model due to `Manual Wrapping` and if so, - # don't wrap it again - # In case the model is already compiled using PyTorch 2.0 and the wrapped model in it - # is a FSDP model, don't wrap it again - is_type_fsdp = isinstance(model, FSDP) or ( - is_compiled_module(model) and isinstance(model._orig_mod, FSDP) - ) - - if not is_type_fsdp: - self.state.fsdp_plugin.set_auto_wrap_policy(model) - fsdp_plugin = self.state.fsdp_plugin - kwargs = { - "sharding_strategy": fsdp_plugin.sharding_strategy, - "cpu_offload": fsdp_plugin.cpu_offload, - "auto_wrap_policy": fsdp_plugin.auto_wrap_policy, - "mixed_precision": fsdp_plugin.mixed_precision_policy, - "sync_module_states": fsdp_plugin.sync_module_states, - "backward_prefetch": fsdp_plugin.backward_prefetch, - "forward_prefetch": fsdp_plugin.forward_prefetch, - "use_orig_params": fsdp_plugin.use_orig_params, - "param_init_fn": fsdp_plugin.param_init_fn, - "ignored_modules": fsdp_plugin.ignored_modules, - "limit_all_gathers": fsdp_plugin.limit_all_gathers, - "device_id": torch.device("hpu", torch.hpu.current_device()), - } - model = FSDP(model, **kwargs) - if fsdp_plugin.activation_checkpointing: - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - CheckpointImpl, - apply_activation_checkpointing, - checkpoint_wrapper, - ) - - apply_activation_checkpointing( - model, - checkpoint_wrapper_fn=functools.partial( - checkpoint_wrapper, - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - ), - auto_wrap_policy=fsdp_plugin.auto_wrap_policy, - ) - # if the previous and current models are same, delete the previous one - if len(self._models) > 1 and (self._models[-2] is self._models[-1]): - del self._models[-2] - self._models[-1] = model - # torch.compile should be called last and only if the model isn't already compiled. - if self.state.dynamo_plugin.backend != GaudiDynamoBackend.NO and not is_compiled_module(model): - model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs()) - return model - - def _prepare_deepspeed(self, *args): - import deepspeed - - deepspeed_plugin = self.state.deepspeed_plugin - - is_dataloader_present = any(isinstance(obj, torch.utils.data.DataLoader) for obj in args) - result = [ - self._prepare_one(obj, first_pass=True) - if isinstance(obj, torch.utils.data.DataLoader) - else convert_model(obj) - if isinstance(obj, torch.nn.Module) and self.state.is_fp8_enabled - else obj - for obj in args - ] - - if deepspeed_plugin.is_auto("train_micro_batch_size_per_gpu"): - if is_dataloader_present: - batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")] - if any(bs is None for bs in batch_sizes): - raise ValueError( - "At least one of the dataloaders passed to `accelerate.prepare()` has `None` as batch size. 
" - "Please set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file " - "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`." - ) - if self.split_batches: - batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes] - - batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes) - if len(batch_sizes) > 1: - logger.info( - "Since you passed both train and evaluation dataloader, `is_train_batch_min` (here " - f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})." - ) - else: - raise ValueError( - "When using DeepSpeed, `accelerate.prepare()` requires you to pass at least one of training or evaluation dataloaders " - "with `batch_size` attribute returning an integer value " - "or alternatively set an integer value in `train_micro_batch_size_per_gpu` in the deepspeed config file " - "or assign integer value to `AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu']`." - ) - else: - batch_size_per_device = deepspeed_plugin.get_value("train_micro_batch_size_per_gpu") - - # handle `gradient_accumulation_steps` when the value is `auto` - deepspeed_plugin.fill_match( - "gradient_accumulation_steps", - must_match=False, - gradient_accumulation_steps=self.gradient_accumulation_steps, - ) - - config_kwargs = { - "train_micro_batch_size_per_gpu": batch_size_per_device, - "train_batch_size": batch_size_per_device - * deepspeed_plugin.get_value("gradient_accumulation_steps") - * self.num_processes, - "gradient_clipping": 1.0, - "zero_optimization.stage3_gather_16bit_weights_on_model_save": False, - } - - model = None - optimizer = None - scheduler = None - for obj in result: - if isinstance(obj, torch.nn.Module): - model = obj - elif isinstance(obj, (torch.optim.Optimizer, DummyOptim)): - optimizer = obj - elif (isinstance(obj, (LRScheduler, DummyScheduler))) or ( - type(obj).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES - ): - scheduler = obj - - if optimizer is not None: - if "optimizer" in deepspeed_plugin.deepspeed_config and not isinstance(optimizer, (DummyOptim)): - raise ValueError( - "You cannot specify an optimizer in the config file and in the code at the same time. " - "Please remove the optimizer from the config file or " - "create `accelerate.utils.DummyOptim` in the code." - ) - elif "optimizer" not in deepspeed_plugin.deepspeed_config and isinstance(optimizer, (DummyOptim)): - raise ValueError( - "You cannot create a `DummyOptim` without specifying an optimizer in the config file." - ) - - if isinstance(optimizer, (torch.optim.Optimizer)): - deepspeed_plugin.deepspeed_config["zero_allow_untested_optimizer"] = True - - if scheduler is not None: - if "scheduler" in deepspeed_plugin.deepspeed_config and not isinstance(scheduler, (DummyScheduler)): - raise ValueError( - "You cannot specify a scheduler in the config file and in the code at the same time. " - "Please remove the scheduler from the config file or " - "create `accelerate.utils.DummyScheduler` in the code." - ) - elif ( - "scheduler" not in deepspeed_plugin.deepspeed_config - and isinstance(scheduler, (DummyScheduler)) - and scheduler.lr_scheduler_callable is None - ): - raise ValueError( - "Either specify a scheduler in the config file or " - "pass in the `lr_scheduler_callable` parameter when using `accelerate.utils.DummyScheduler`." 
- ) - - if optimizer is not None and scheduler is not None: - if isinstance(optimizer, (DummyOptim)) and not isinstance(scheduler, (DummyScheduler)): - raise ValueError( - "You can only specify `accelerate.utils.DummyScheduler` in the code when using " - "`accelerate.utils.DummyOptim`." - ) - - if model is not None: - # deal with config keys that use `auto` value and rely on model's hidden_size - hidden_size_based_keys = [ - "zero_optimization.reduce_bucket_size", - "zero_optimization.stage3_prefetch_bucket_size", - "zero_optimization.stage3_param_persistence_threshold", - ] - hidden_size_auto_keys = [x for x in hidden_size_based_keys if deepspeed_plugin.is_auto(x)] - if len(hidden_size_auto_keys) > 0: - reasoning = ( - "therefore it's not possible to automatically fill out the following `auto` entries " - + f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing " - + "`auto` values for these keys with an integer value of your choice." - ) - if not hasattr(model, "config"): - raise ValueError("Can't find `model.config` entry, " + reasoning) - - if hasattr(model.config, "hidden_size"): - hidden_size = model.config.hidden_size - elif hasattr(model.config, "hidden_sizes"): - # if there are many hidden sizes pick the largest one - hidden_size = max(model.config.hidden_sizes) - else: - raise ValueError( - "Can find neither `model.config.hidden_size` nor `model.config.hidden_sizes`, " + reasoning - ) - - config_kwargs.update( - { - "zero_optimization.reduce_bucket_size": hidden_size * hidden_size, - "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size, - "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size, - } - ) - - if isinstance(optimizer, (DummyOptim)): - config_kwargs.update( - {"optimizer.params.lr": optimizer.lr, "optimizer.params.weight_decay": optimizer.weight_decay} - ) - if isinstance(scheduler, (DummyScheduler)) and scheduler.lr_scheduler_callable is None: - max_lr = ( - getattr(scheduler.optimizer, "lr", None) - if getattr(scheduler.optimizer, "defaults", None) is None - else scheduler.optimizer.defaults["lr"] - ) - config_kwargs.update( - { - "scheduler.params.warmup_min_lr": 0, - "scheduler.params.warmup_max_lr": max_lr, - "scheduler.params.warmup_num_steps": scheduler.warmup_num_steps, - } - ) - if scheduler.total_num_steps is not None: - config_kwargs["scheduler.params.total_num_steps"] = ( - math.ceil(scheduler.total_num_steps / self.num_processes) - if not self.split_batches - else scheduler.total_num_steps - ) - deepspeed_plugin.deepspeed_config_process(must_match=False, **config_kwargs) - self.deepspeed_config = deepspeed_plugin.deepspeed_config - kwargs = {"model": model, "config_params": self.deepspeed_config} - if optimizer is not None: - if isinstance(optimizer, (DummyOptim)): - kwargs["model_parameters"] = optimizer.params - if isinstance(scheduler, (DummyScheduler)) and scheduler.lr_scheduler_callable is not None: - kwargs["lr_scheduler"] = scheduler.lr_scheduler_callable - else: - if self.deepspeed_config["zero_optimization"].get("offload_optimizer", {}).get( - "device", "none" - ) != "none" and self.deepspeed_config.get("zero_force_ds_cpu_optimizer", True): - from deepspeed.ops.adam import DeepSpeedCPUAdam - - defaults = {k: v for k, v in optimizer.defaults.items() if k in ["lr", "weight_decay"]} - optimizer = DeepSpeedCPUAdam(optimizer.param_groups, **defaults) - kwargs["optimizer"] = optimizer - if scheduler is not None: - if type(scheduler).__name__ in 
deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES: - kwargs["lr_scheduler"] = scheduler - - HabanaArgs = make_dataclass("HabanaArgs", [("use_hpu", bool), ("no_cuda", bool)]) - habana_args = HabanaArgs( - use_hpu=True if self.device.type == "hpu" else False, - no_cuda=True if self.device.type == "cpu" else False, - ) - if habana_args.use_hpu: - # This env variable is initialized here to make sure it is set to "true" - # It should be done by the launcher but it does not work for multi-node runs - os.environ["DEEPSPEED_USE_HPU"] = "true" - - engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs) - # torch.compile should be called if dynamo plugin backend is set and only if the model isn't already compiled. - if self.state.dynamo_plugin.backend == GaudiDynamoBackend.HPU_BACKEND and not is_compiled_module( - kwargs["model"] - ): - engine.compile() - if optimizer is not None: - optimizer = DeepSpeedOptimizerWrapper(optimizer) - if scheduler is not None: - if lr_scheduler is None: - scheduler = AcceleratedScheduler( - scheduler, - optimizer, - step_with_optimizer=self.step_scheduler_with_optimizer, - split_batches=self.split_batches, - ) - else: - scheduler = DeepSpeedSchedulerWrapper(lr_scheduler, optimizer) - - for i in range(len(result)): - if isinstance(result[i], torch.nn.Module): - result[i] = engine - elif isinstance(result[i], (torch.optim.Optimizer, DummyOptim)): - result[i] = optimizer - elif (isinstance(result[i], (LRScheduler, DummyScheduler))) or ( - type(result[i]).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES - ): - result[i] = scheduler - # pointing for deepspeed_engine_wrapped.backward() - self.deepspeed_engine_wrapped = DeepSpeedEngineWrapper(engine) - self._models.append(engine) - if optimizer is not None: - self._optimizers.append(optimizer) - if scheduler is not None: - self._schedulers.append(scheduler) - if len(self._models) > 1: - raise AssertionError( - "You can't use same `Accelerator()` instance with multiple models when using DeepSpeed" - ) - return tuple(result) - - def prepare_data_loader( - self, data_loader: torch.utils.data.DataLoader, device_placement=None, slice_fn_for_dispatch=None - ): - """ - Prepares a PyTorch DataLoader for training in any distributed setup. It is recommended to use - [`Accelerator.prepare`] instead. - - Args: - data_loader (`torch.utils.data.DataLoader`): - A vanilla PyTorch DataLoader to prepare - device_placement (`bool`, *optional*): - Whether or not to place the batches on the proper device in the prepared dataloader. Will default to - `self.device_placement`. - slice_fn_for_dispatch (`Callable`, *optional*`): - If passed, this function will be used to slice tensors across `num_processes`. Will default to - [`~utils.slice_tensors`]. This argument is used only when `dispatch_batches` is set to `True` and will - be ignored otherwise. - - Example: - - ```python - >>> import torch - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> data_loader = torch.utils.data.DataLoader(...) 
- >>> data_loader = accelerator.prepare_data_loader(data_loader, device_placement=True) - ``` - """ - # Ensure we can't double wrap a DataLoader due to `find_batch_size` - if getattr(data_loader, "_is_accelerate_prepared", False): - if data_loader not in self._dataloaders: - self._dataloaders.append(data_loader) - return data_loader - if device_placement is None: - device_placement = self.device_placement - prepared_data_loader = gaudi_prepare_data_loader( - data_loader, - self.device, - num_processes=self.num_processes, - process_index=self.process_index, - split_batches=self.split_batches, - put_on_device=device_placement, - rng_types=self.rng_types.copy(), - dispatch_batches=self.dispatch_batches, - even_batches=self.even_batches, - slice_fn_for_dispatch=slice_fn_for_dispatch, - use_seedable_sampler=self.use_seedable_sampler, - ) - self._dataloaders.append(prepared_data_loader) - return prepared_data_loader - - def gather(self, tensor): - """ - Gather the values in *tensor* across all processes and concatenate them on the first dimension. Useful to - regroup the predictions from all processes when doing evaluation. - - Note: - This gather happens in all processes. - - Args: - tensor (`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`): - The tensors to gather across all processes. - - Returns: - `torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`: The gathered tensor(s). Note that the - first dimension of the result is *num_processes* multiplied by the first dimension of the input tensors. - - Example: - - ```python - >>> # Assuming four processes - >>> import torch - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> process_tensor = torch.tensor([accelerator.process_index]) - >>> gathered_tensor = accelerator.gather(process_tensor) - >>> gathered_tensor - tensor([0, 1, 2, 3]) - ``` - """ - if GaudiPartialState().distributed_type in [ - GaudiDistributedType.MULTI_HPU, - GaudiDistributedType.DEEPSPEED, - GaudiDistributedType.FSDP, - ]: - return _gpu_gather(tensor) - else: - return tensor - - def get_state_dict(self, model, unwrap=True): - """ - Returns the state dictionary of a model sent through [`Accelerator.prepare`] potentially without full - precision. - - Args: - model (`torch.nn.Module`): - A PyTorch model sent through [`Accelerator.prepare`] - unwrap (`bool`, *optional*, defaults to `True`): - Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict - - Returns: - `dict`: The state dictionary of the model potentially without full precision. - - Example: - - ```python - >>> import torch - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator() - >>> net = torch.nn.Linear(2, 2) - >>> net = accelerator.prepare(net) - >>> state_dict = accelerator.get_state_dict(net) - ``` - """ - - if self.distributed_type == DistributedType.DEEPSPEED: - if self.deepspeed_config["zero_optimization"]["stage"] == 3: - if model.zero_gather_16bit_weights_on_model_save(): - state_dict = model._zero3_consolidated_16bit_state_dict() - else: - raise ValueError( - "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. " - "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or " - "set `zero3_save_16bit_model` to True when using `accelerate config`. 
" - "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights." - ) - else: - from deepspeed.checkpoint.utils import clone_tensors_for_torch_save - - state_dict = clone_tensors_for_torch_save(self.unwrap_model(model).state_dict()) - # copied from https://github.com/huggingface/accelerate/blob/6f05bbd41a179cc9a86238c7c6f3f4eded70fbd8/src/accelerate/accelerator.py#L3057 - elif self.distributed_type == DistributedType.FSDP: - from torch.distributed.fsdp import FullStateDictConfig, StateDictType - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - - full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, full_state_dict_config): - state_dict = model.state_dict() - else: - if unwrap: - model = self.unwrap_model(model) - state_dict = model.state_dict() - - return state_dict - - @contextmanager - def autocast(self, cache_enabled: bool = False): - """ - Will apply automatic mixed-precision inside the block inside this context manager, if it is enabled. Nothing - different will happen otherwise. - - Example: - - ```python - >>> from accelerate import Accelerator - - >>> accelerator = Accelerator(mixed_precision="fp16") - >>> with accelerator.autocast(): - ... train() - ``` - """ - if self.native_amp: - autocast_context = torch.autocast(device_type=self.state.device.type, dtype=torch.bfloat16) - else: - autocast_context = contextlib.nullcontext() - - autocast_context.__enter__() - yield - autocast_context.__exit__(*sys.exc_info()) diff --git a/optimum/habana/accelerate/data_loader.py b/optimum/habana/accelerate/data_loader.py deleted file mode 100644 index aa9f14d1b7..0000000000 --- a/optimum/habana/accelerate/data_loader.py +++ /dev/null @@ -1,426 +0,0 @@ -from typing import Callable, List, Optional, Union - -import torch -from accelerate.data_loader import ( - _PYTORCH_DATALOADER_KWARGS, - BatchSamplerShard, - DataLoaderDispatcher, - DataLoaderShard, - IterableDatasetShard, - SeedableRandomSampler, -) -from accelerate.state import GradientState -from accelerate.utils import ( - RNGType, - concatenate, - find_batch_size, - get_data_structure, - is_torch_version, - send_to_device, - slice_tensors, -) -from torch.utils.data import BatchSampler, DataLoader, IterableDataset - -from .state import GaudiAcceleratorState -from .utils.operations import ( - broadcast, - broadcast_object_list, - initialize_tensors, -) - - -class GaudiDataLoaderDispatcher(DataLoaderDispatcher, DataLoader): - """ - Subclass of a PyTorch `DataLoader` that will iterate and preprocess on process 0 only, then dispatch on each - process their part of the batch. - - Args: - split_batches (`bool`, *optional*, defaults to `False`): - Whether the resulting `DataLoader` should split the batches of the original data loader across devices or - yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of - `num_processes` batches at each iteration). Another way to see this is that the observed batch size will be - the same as the initial `dataloader` if this option is set to `True`, the batch size of the initial - `dataloader` multiplied by `num_processes` otherwise. Setting this option to `True` requires that the batch - size of the `dataloader` is a round multiple of `batch_size`. - skip_batches (`int`, *optional*, defaults to 0): - The number of batches to skip at the beginning of an iteration. 
- - **Available attributes:** - - - **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes. - Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total - number of processes - - - **total_dataset_length** (`int`) -- Total length of the inner dataset across all processes. - """ - - def __init__( - self, dataset, split_batches: bool = False, skip_batches=0, _drop_last: bool = False, slice_fn=None, **kwargs - ): - shuffle = False - if is_torch_version(">=", "1.11.0"): - from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe - - # We need to save the shuffling state of the DataPipe - if isinstance(dataset, ShufflerIterDataPipe): - shuffle = dataset._shuffle_enabled - DataLoader.__init__(self, dataset, **kwargs) - self.split_batches = split_batches - if shuffle: - torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle) - - self.gradient_state = GradientState() - self.state = GaudiAcceleratorState() - self._drop_last = _drop_last - self.skip_batches = skip_batches - - self.slice_fn = slice_tensors if slice_fn is None else slice_fn - self.iteration = 0 - - def _fetch_batches(self, iterator): - batches, batch = None, None - # On process 0, we gather the batch to dispatch. - if self.state.process_index == 0: - try: - if self.split_batches: - # One batch of the main iterator is dispatched and split. - batch = next(iterator) - else: - # num_processes batches of the main iterator are concatenated then dispatched and split. - # We add the batches one by one so we have the remainder available when drop_last=False. - batches = [] - for _ in range(self.state.num_processes): - batches.append(next(iterator)) - try: - batch = concatenate(batches, dim=0) - except RuntimeError as e: - raise RuntimeError( - "You can't use batches of different size with `dispatch_batches=True` or when using an `IterableDataset`." - "either pass `dispatch_batches=False` and have each process fetch its own batch " - " or pass `split_batches=True`. By doing so, the main process will fetch a full batch and " - "slice it into `num_processes` batches for each process." - ) from e - # In both cases, we need to get the structure of the batch that we will broadcast on other - # processes to initialize the tensors with the right shape. - # data_structure, stop_iteration - batch_info = [get_data_structure(batch), False] - except StopIteration: - batch_info = [None, True] - else: - batch_info = [None, self._stop_iteration] - # This is inplace, so after this instruction, every process has the same `batch_info` as process 0. - broadcast_object_list(batch_info) - self._stop_iteration = batch_info[1] - if self._stop_iteration: - # If drop_last is False and split_batches is False, we may have a remainder to take care of. - if not self.split_batches and not self._drop_last: - if self.state.process_index == 0 and len(batches) > 0: - batch = concatenate(batches, dim=0) - batch_info = [get_data_structure(batch), False] - else: - batch_info = [None, True] - broadcast_object_list(batch_info) - return batch, batch_info - - def __iter__(self): - self.begin() - self.set_epoch(self.iteration) - main_iterator = None - if is_torch_version(">=", "2.0.1"): - # NOTE PyTorch DataLoader adds forward compatibilities for DataPipes, which broadcasts - # shared seed to all dist processes. Thus, we need to create iterator for all dist processes. - # But, we only iterate through the DataLoader on process 0. 
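# Rough summary of the dispatch protocol below: process 0 fetches the next
# batch (or num_processes batches), broadcasts only the batch structure first,
# the other processes allocate empty tensors of that shape, the full batch is
# then broadcast from process 0, and every process keeps just its own slice.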
- main_iterator = DataLoader.__iter__(self) - elif self.state.process_index == 0: - main_iterator = DataLoader.__iter__(self) - stop_iteration = False - self._stop_iteration = False - first_batch = None - next_batch, next_batch_info = self._fetch_batches(main_iterator) - batch_index = 0 - while not stop_iteration: - batch, batch_info = next_batch, next_batch_info - - if self.state.process_index != 0: - # Initialize tensors on other processes than process 0. - batch = initialize_tensors(batch_info[0]) - batch = send_to_device(batch, self.state.device) - # Broadcast the batch before splitting it. - batch = broadcast(batch, from_process=0) - - if not self._drop_last and first_batch is None: - # We keep at least num processes elements of the first batch to be able to complete the last batch - first_batch = self.slice_fn( - batch, - slice(0, self.state.num_processes), - process_index=self.state.process_index, - num_processes=self.state.num_processes, - ) - - if batch is None: - raise ValueError( - f"Batch does not contain any data (`{batch}`). At the end of all iterable data available before expected stop iteration." - ) - - observed_batch_size = find_batch_size(batch) - batch_size = observed_batch_size // self.state.num_processes - - stop_iteration = self._stop_iteration - if not stop_iteration: - # We may still be at the end of the dataloader without knowing it yet: if there is nothing left in - # the dataloader since the number of batches is a round multiple of the number of processes. - next_batch, next_batch_info = self._fetch_batches(main_iterator) - # next_batch_info[0] is None when there are no more batches, otherwise we still need to process them. - if self._stop_iteration and next_batch_info[0] is None: - stop_iteration = True - - if not self._drop_last and stop_iteration and observed_batch_size % self.state.num_processes != 0: - # If the last batch is not complete, let's add the first batch to it. - batch = concatenate([batch, first_batch], dim=0) - # Batch size computation above is wrong, it's off by 1 so we fix it. - batch_size += 1 - - data_slice = slice(self.state.process_index * batch_size, (self.state.process_index + 1) * batch_size) - batch = self.slice_fn( - batch, - data_slice, - process_index=self.state.process_index, - num_processes=self.state.num_processes, - ) - - if stop_iteration: - self.end_of_dataloader = True - self.remainder = observed_batch_size - if batch_index >= self.skip_batches: - yield batch - batch_index += 1 - self.iteration += 1 - self.end() - - -def gaudi_prepare_data_loader( - dataloader: DataLoader, - device: Optional[torch.device] = None, - num_processes: Optional[int] = None, - process_index: Optional[int] = None, - split_batches: bool = False, - put_on_device: bool = False, - rng_types: Optional[List[Union[str, RNGType]]] = None, - dispatch_batches: Optional[bool] = None, - even_batches: bool = True, - slice_fn_for_dispatch: Optional[Callable] = None, - use_seedable_sampler: bool = False, -) -> DataLoader: - """ - Wraps a PyTorch `DataLoader` to generate batches for one of the processes only. - - Depending on the value of the `drop_last` attribute of the `dataloader` passed, it will either stop the iteration - at the first batch that would be too small / not present on all processes or loop with indices from the beginning. - - Args: - dataloader (`torch.utils.data.dataloader.DataLoader`): - The data loader to split across several devices. - device (`torch.device`): - The target device for the returned `DataLoader`. 
- num_processes (`int`, *optional*): - The number of processes running concurrently. Will default to the value given by - [`GaudiAcceleratorState`]. - process_index (`int`, *optional*): - The index of the current process. Will default to the value given by [`GaudiAcceleratorState`]. - split_batches (`bool`, *optional*, defaults to `False`): - Whether the resulting `DataLoader` should split the batches of the original data loader across devices or - yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of - `num_processes` batches at each iteration). - - Another way to see this is that the observed batch size will be the same as the initial `dataloader` if - this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes` - otherwise. - - Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of - `batch_size`. - put_on_device (`bool`, *optional*, defaults to `False`): - Whether or not to put the batches on `device` (only works if the batches are nested list, tuples or - dictionaries of tensors). - rng_types (list of `str` or [`~utils.RNGType`]): - The list of random number generators to synchronize at the beginning of each iteration. Should be one or - several of: - - - `"torch"`: the base torch random number generator - - `"cuda"`: the CUDA random number generator (GPU only) - - `"xla"`: the XLA random number generator (TPU only) - - `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your - dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type. - - dispatch_batches (`bool`, *optional*): - If set to `True`, the datalaoder prepared is only iterated through on the main process and then the batches - are split and broadcast to each process. Will default to `True` when the underlying dataset is an - `IterableDataset`, `False` otherwise. - even_batches (`bool`, *optional*, defaults to `True`): - If set to `True`, in cases where the total batch size across all processes does not exactly divide the - dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among - all workers. - slice_fn_for_dispatch (`Callable`, *optional*`): - If passed, this function will be used to slice tensors across `num_processes`. Will default to - [`~utils.slice_tensors`]. This argument is used only when `dispatch_batches` is set to `True` and will be - ignored otherwise. - use_seedable_sampler (`bool`, *optional*, defaults to `False`): - Whether to use the [`~data_loader.SeedableRandomSampler`] instead of a `RandomSampler` for better - reproducability. Comes at a cost of potentially different performances due to different shuffling - algorithms but ensures results will be the *exact* same. - - Returns: - `torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches - - - - `BatchSampler`s with varying batch sizes are not enabled by default. 
To enable this behaviour, set `even_batches` - equal to `False` - - - """ - if dispatch_batches is None: - if not put_on_device: - dispatch_batches = False - else: - dispatch_batches = isinstance(dataloader.dataset, IterableDataset) - - if dispatch_batches and not put_on_device: - raise ValueError("Using `dispatch_batches=True` requires `put_on_device=True`.") - # Grab defaults from GaudiAcceleratorState - state = GaudiAcceleratorState() - if num_processes is None: - num_processes = state.num_processes - if process_index is None: - process_index = state.process_index - - # Sanity check - batch_size = dataloader.batch_size if dataloader.batch_size is not None else dataloader.batch_sampler.batch_size - if split_batches and batch_size > 1 and batch_size % num_processes != 0: - raise ValueError( - f"To use a `DataLoader` in `split_batches` mode, the batch size ({dataloader.batch_size}) " - f"needs to be a round multiple of the number of processes ({num_processes})." - ) - - new_dataset = dataloader.dataset - # Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it - new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None - sampler_is_batch_sampler = False - synchronized_generator = None - sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler) - if sampler_is_batch_sampler: - sampler = dataloader.sampler.sampler - else: - sampler = dataloader.batch_sampler.sampler - # Commenting the block below as it makes the accuracy decrease quite a lot for a few models and tasks - # e.g. audio classification with Wav2Vec2 or Seq2SeqQA with T5 - # if isinstance(sampler, RandomSampler) and use_seedable_sampler: - # # When iterating through the dataloader during distributed processes - # # we want to ensure that on each process we are iterating through the same - # # samples in the same order if a seed is set. This requires a tweak - # # to the `torch.utils.data.RandomSampler` class (if used). - # sampler = SeedableRandomSampler( - # data_source=sampler.data_source, - # replacement=sampler.replacement, - # num_samples=sampler._num_samples, - # generator=getattr(sampler, "generator", torch.Generator()), - # ) - - # No change if no multiprocess - if num_processes != 1 and not dispatch_batches: - if isinstance(new_dataset, IterableDataset): - if getattr(dataloader.dataset, "generator", None) is not None: - synchronized_generator = dataloader.dataset.generator - new_dataset = IterableDatasetShard( - new_dataset, - batch_size=dataloader.batch_size, - drop_last=dataloader.drop_last, - num_processes=num_processes, - process_index=process_index, - split_batches=split_batches, - ) - else: - # The block below was removed in Accelerate but it makes the accuracy decrease quite a lot - # for a few models and tasks e.g. audio classification with Wav2Vec2 or Seq2SeqQA with T5 - # Keeping it for now - # New batch sampler for the current process. 
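# In short: the current sampler keeps (or is given) a torch.Generator so that
# shuffling stays synchronized across processes, and the batch sampler is then
# wrapped in a BatchSamplerShard so that each process only iterates over the
# batches (or batch slices) assigned to it.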
- sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler) - if sampler_is_batch_sampler: - sampler = dataloader.sampler.sampler - else: - sampler = dataloader.batch_sampler.sampler - if hasattr(sampler, "generator"): - if sampler.generator is None: - sampler.generator = torch.Generator() - synchronized_generator = sampler.generator - - batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler - new_batch_sampler = BatchSamplerShard( - batch_sampler, - num_processes=num_processes, - process_index=process_index, - split_batches=split_batches, - even_batches=even_batches, - ) - - # We ignore all of those since they are all dealt with by our new_batch_sampler - ignore_kwargs = [ - "batch_size", - "shuffle", - "sampler", - "batch_sampler", - "drop_last", - ] - - if rng_types is not None and synchronized_generator is None and "generator" in rng_types: - rng_types.remove("generator") - - kwargs = { - k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k]) - for k in _PYTORCH_DATALOADER_KWARGS - if k not in ignore_kwargs - } - - # Need to provide batch_size as batch_sampler is None for Iterable dataset - if new_batch_sampler is None: - kwargs["drop_last"] = dataloader.drop_last - kwargs["batch_size"] = ( - dataloader.batch_size // num_processes if split_batches and not dispatch_batches else dataloader.batch_size - ) - if isinstance(sampler, SeedableRandomSampler) and use_seedable_sampler: - if sampler_is_batch_sampler: - dataloader.sampler.sampler = sampler - else: - dataloader.batch_sampler.sampler = sampler - if dispatch_batches: - kwargs.pop("generator") - dataloader = GaudiDataLoaderDispatcher( - new_dataset, - split_batches=split_batches, - batch_sampler=new_batch_sampler, - _drop_last=dataloader.drop_last, - slice_fn=slice_fn_for_dispatch, - **kwargs, - ) - elif sampler_is_batch_sampler: - dataloader = DataLoaderShard( - new_dataset, - device=device if put_on_device else None, - sampler=new_batch_sampler, - batch_size=dataloader.batch_size, - rng_types=rng_types, - _drop_last=dataloader.drop_last, - synchronized_generator=synchronized_generator, - **kwargs, - ) - else: - dataloader = DataLoaderShard( - new_dataset, - device=device if put_on_device else None, - batch_sampler=new_batch_sampler, - rng_types=rng_types, - synchronized_generator=synchronized_generator, - _drop_last=dataloader.drop_last, - **kwargs, - ) - - return dataloader diff --git a/optimum/habana/accelerate/state.py b/optimum/habana/accelerate/state.py deleted file mode 100644 index eda6ed4b03..0000000000 --- a/optimum/habana/accelerate/state.py +++ /dev/null @@ -1,207 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
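Before the state utilities below, here is a minimal, hypothetical sketch of how `gaudi_prepare_data_loader` (defined above) could be driven directly; in practice it is invoked through `GaudiAccelerator.prepare_data_loader`, and the toy dataset, batch size, and device are illustrative assumptions rather than values from this repository:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from optimum.habana.accelerate.data_loader import gaudi_prepare_data_loader

# Toy dataset standing in for a real training set (hypothetical shapes).
dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Shard the dataloader across the processes known to GaudiAcceleratorState
# and move each batch to the HPU as it is yielded.
prepared = gaudi_prepare_data_loader(
    dataloader,
    device=torch.device("hpu"),
    put_on_device=True,
)

for inputs, labels in prepared:
    pass  # forward/backward would go here
```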
- -import os - -import torch -from accelerate.state import AcceleratorState, PartialState -from accelerate.utils import is_deepspeed_available, parse_choice_from_env, parse_flag_from_env - -from optimum.utils import logging - -from .utils import GaudiDistributedType - - -logger = logging.get_logger() - - -class GaudiPartialState(PartialState): - """ - Adapted from: https://github.com/huggingface/accelerate/blob/8514c35192ac9762920f1ab052e5cea4c0e46eeb/src/accelerate/state.py#L96 - """ - - def __init__(self, cpu: bool = False, **kwargs): - self.__dict__ = self._shared_state - if not self.initialized: - self._cpu = cpu - self.backend = None - env_device = os.environ.get("ACCELERATE_TORCH_DEVICE", None) - self.device = torch.device(env_device) if env_device is not None else None - self.debug = parse_flag_from_env("ACCELERATE_DEBUG_MODE") - - # initialize_distributed_hpu is already called in the __init__ of - # habana_frameworks.torch.distributed.hccl - # It is necessary so that the env variable LOCAL_RANK is set before the - # conditional statement right below - from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu - - if int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu: - world_size, rank, local_rank = initialize_distributed_hpu() - self.backend = kwargs.pop("backend", "hccl") - - if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true": - if not is_deepspeed_available(): - raise ImportError( - "DeepSpeed is not available, install it with: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0`." - ) - self.distributed_type = GaudiDistributedType.DEEPSPEED - import deepspeed - - if world_size > 1: - os.environ["HLS_MODULE_ID"] = str(local_rank) - os.environ["ID"] = str(rank) - - deepspeed.init_distributed(dist_backend=self.backend, **kwargs) - logger.info("DeepSpeed is enabled.") - self._mixed_precision = "no" # deepspeed handles mixed_precision using deepspeed_config - elif os.environ.get("ACCELERATE_USE_FSDP", "false") == "true": - self.distributed_type = GaudiDistributedType.FSDP - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group(backend=self.backend, rank=rank, world_size=world_size) - logger.info("Enabled distributed run.") - else: - self.distributed_type = GaudiDistributedType.MULTI_HPU - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group(backend=self.backend, rank=rank, world_size=world_size) - logger.info("Enabled distributed run.") - self.num_processes = world_size - self.process_index = rank - self.local_process_index = local_rank - if self.device is None: - # TODO: replace by `torch.device("hpu", self.local_process_index)` when hpu:x is supported - self.device = torch.device("hpu") - else: - self.distributed_type = ( - GaudiDistributedType.NO - if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "false" - else GaudiDistributedType.DEEPSPEED - ) - self.num_processes = 1 - self.process_index = self.local_process_index = 0 - logger.info("Single-device run.") - - if self.device is None: - self.device = torch.device("cpu") if cpu else self.default_device - - self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0) - - def wait_for_everyone(self): - """ - Will stop the execution of the current process until every other process has reached that point (so this does - nothing when the script is only run in one process). Useful to do before saving a model. 
- - Example: - - ```python - >>> # Assuming two GPU processes - >>> import time - >>> from accelerate.state import PartialState - - >>> state = PartialState() - >>> if state.is_main_process: - ... time.sleep(2) - >>> else: - ... print("I'm waiting for the main process to finish its sleep...") - >>> state.wait_for_everyone() - >>> # Should print on every process at the same time - >>> print("Everyone is here") - ``` - """ - if self.distributed_type in ( - GaudiDistributedType.DEEPSPEED, - GaudiDistributedType.MULTI_HPU, - GaudiDistributedType.FSDP, - ): - torch.distributed.barrier() - - @property - def default_device(self) -> torch.device: - """ - Returns the default device which is: - - HPU if it is available - - CPU otherwise - """ - import habana_frameworks.torch.hpu as hthpu - - if hthpu.is_available(): - return torch.device("hpu") - else: - return torch.device("cpu") - - -class GaudiAcceleratorState(AcceleratorState): - """ - Adapted from: https://github.com/huggingface/accelerate/blob/8514c35192ac9762920f1ab052e5cea4c0e46eeb/src/accelerate/state.py#L683 - """ - - def __init__( - self, - mixed_precision: str = None, - cpu: bool = False, - dynamo_plugin=None, - deepspeed_plugin=None, - fsdp_plugin=None, - megatron_lm_plugin=None, - _from_accelerator: bool = False, - **kwargs, - ): - self.__dict__ = self._shared_state - if parse_flag_from_env("ACCELERATE_USE_CPU"): - cpu = True - if GaudiPartialState._shared_state == {}: - GaudiPartialState(cpu, **kwargs) - self.__dict__.update(GaudiPartialState._shared_state) - self._check_initialized(mixed_precision, cpu) - if not self.initialized: - self.deepspeed_plugin = None - self.use_ipex = None - mixed_precision = ( - parse_choice_from_env("ACCELERATE_MIXED_PRECISION", "no") - if mixed_precision is None - else mixed_precision.lower() - ) - self.is_fp8_enabled = mixed_precision == "fp8" - self.dynamo_plugin = dynamo_plugin - # deepspeed handles mixed_precision using deepspeed_config - self._mixed_precision = ( - "no" if self.distributed_type == GaudiDistributedType.DEEPSPEED else mixed_precision - ) - if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true" and not cpu: - self.deepspeed_plugin = deepspeed_plugin - if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" and not cpu: - if self._mixed_precision != "no": - fsdp_plugin.set_mixed_precision(self._mixed_precision) - self.fsdp_plugin = fsdp_plugin - GaudiPartialState._shared_state["distributed_type"] = self.distributed_type - self.use_ipex = False - - @property - def mixed_precision(self): - if self.distributed_type == GaudiDistributedType.DEEPSPEED: - config = self.deepspeed_plugin.deepspeed_config - if config.get("fp16", {}).get("enabled", False): - mixed_precision = "fp16" - elif config.get("bf16", {}).get("enabled", False): - mixed_precision = "bf16" - else: - mixed_precision = "no" - else: - mixed_precision = self._mixed_precision - - if mixed_precision == "fp16": - raise ValueError("fp16 is not supported on Habana Gaudi.") - - return mixed_precision diff --git a/optimum/habana/accelerate/utils/__init__.py b/optimum/habana/accelerate/utils/__init__.py deleted file mode 100755 index ee25954b95..0000000000 --- a/optimum/habana/accelerate/utils/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .dataclasses import ( - GaudiDistributedType, - GaudiDynamoBackend, - GaudiFP8RecipeKwargs, - GaudiFullyShardedDataParallelPlugin, - GaudiTorchDynamoPlugin, -) -from .transformer_engine import ( - FP8ContextWrapper, - convert_model, - get_fp8_recipe, -) diff --git 
a/optimum/habana/accelerate/utils/dataclasses.py b/optimum/habana/accelerate/utils/dataclasses.py deleted file mode 100644 index 1db6980ee7..0000000000 --- a/optimum/habana/accelerate/utils/dataclasses.py +++ /dev/null @@ -1,190 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from dataclasses import dataclass -from enum import Enum - -import torch -from accelerate.utils import FullyShardedDataParallelPlugin -from accelerate.utils.constants import FSDP_BACKWARD_PREFETCH -from accelerate.utils.dataclasses import BaseEnum, KwargsHandler, TorchDynamoPlugin -from accelerate.utils.environment import str_to_bool - - -class GaudiDistributedType(str, Enum): - """ - Represents a type of distributed environment. - Adapted from: https://github.com/huggingface/accelerate/blob/8514c35192ac9762920f1ab052e5cea4c0e46eeb/src/accelerate/utils/dataclasses.py#L176 - - Values: - - - **NO** -- Not a distributed environment, just a single process. - - **MULTI_HPU** -- Distributed on multiple HPUs. - - **DEEPSPEED** -- Using DeepSpeed. - - **FSDP** -- Using FSDP. - """ - - # Subclassing str as well as Enum allows the `GaudiDistributedType` to be JSON-serializable out of the box. - NO = "NO" - MULTI_HPU = "MULTI_HPU" - DEEPSPEED = "DEEPSPEED" - FSDP = "FSDP" - - -class GaudiDynamoBackend(str, BaseEnum): - """ - Represents a dynamo backend (see https://github.com/pytorch/torchdynamo). - - Values: - - - **NO** -- Do not use torch dynamo. - - **EAGER** -- Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo - issues. - - **AOT_EAGER** -- Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's - extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups. - - **INDUCTOR** -- Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton - kernels. [Read - more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) - - **AOT_TS_NVFUSER** -- nvFuser with AotAutograd/TorchScript. [Read - more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) - - **NVPRIMS_NVFUSER** -- nvFuser with PrimTorch. [Read - more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) - - **CUDAGRAPHS** -- cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757) - - **OFI** -- Uses Torchscript optimize_for_inference. Inference only. [Read - more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html) - - **FX2TRT** -- Uses Nvidia TensorRT for inference optimizations. Inference only. [Read - more](https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst) - - **ONNXRT** -- Uses ONNXRT for inference on CPU/GPU. Inference only. 
[Read more](https://onnxruntime.ai/) - - **TENSORRT** -- Uses ONNXRT to run TensorRT for inference optimizations. [Read - more](https://github.com/onnx/onnx-tensorrt) - - **IPEX** -- Uses IPEX for inference on CPU. Inference only. [Read - more](https://github.com/intel/intel-extension-for-pytorch). - - **TVM** -- Uses Apach TVM for inference optimizations. [Read more](https://tvm.apache.org/) - - **HPU_BACKEND** -- Uses Intel Gaudi. - - """ - - # Subclassing str as well as Enum allows the `SageMakerDistributedType` to be JSON-serializable out of the box. - NO = "NO" - EAGER = "EAGER" - AOT_EAGER = "AOT_EAGER" - INDUCTOR = "INDUCTOR" - AOT_TS_NVFUSER = "AOT_TS_NVFUSER" - NVPRIMS_NVFUSER = "NVPRIMS_NVFUSER" - CUDAGRAPHS = "CUDAGRAPHS" - OFI = "OFI" - FX2TRT = "FX2TRT" - ONNXRT = "ONNXRT" - TENSORRT = "TENSORRT" - IPEX = "IPEX" - TVM = "TVM" - HPU_BACKEND = "HPU_BACKEND" - - -@dataclass -class GaudiTorchDynamoPlugin(TorchDynamoPlugin): - """ - This plugin is used to compile a model with PyTorch 2.0 on Gaudi. - """ - - def __post_init__(self): - prefix = "ACCELERATE_DYNAMO_" - if self.backend is None: - self.backend = os.environ.get(prefix + "BACKEND", "no") - self.backend = GaudiDynamoBackend(self.backend.upper()) - if self.mode is None: - self.mode = os.environ.get(prefix + "MODE", "default") - if self.fullgraph is None: - self.fullgraph = str_to_bool(os.environ.get(prefix + "USE_FULLGRAPH", "False")) == 1 - if self.dynamic is None: - self.dynamic = str_to_bool(os.environ.get(prefix + "USE_DYNAMIC", "False")) == 1 - - -@dataclass -class GaudiFullyShardedDataParallelPlugin(FullyShardedDataParallelPlugin): - def __post_init__(self): - from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, CPUOffload, ShardingStrategy - - prefix = "FSDP_" - if self.sharding_strategy is None: - self.sharding_strategy = ShardingStrategy(int(os.environ.get(prefix + "SHARDING_STRATEGY", 1))) - - if self.cpu_offload is None: - if str_to_bool(os.environ.get(prefix + "OFFLOAD_PARAMS", "False")) == 1: - self.cpu_offload = CPUOffload(offload_params=True) - else: - self.cpu_offload = CPUOffload(offload_params=False) - - if self.backward_prefetch is None: - prefetch_policy = os.environ.get(prefix + "BACKWARD_PREFETCH", "NO_PREFETCH") - if prefetch_policy != FSDP_BACKWARD_PREFETCH[-1]: - self.backward_prefetch = BackwardPrefetch(FSDP_BACKWARD_PREFETCH.index(prefetch_policy) + 1) - - if self.state_dict_type is None: - state_dict_type_policy = os.environ.get(prefix + "STATE_DICT_TYPE", "FULL_STATE_DICT") - self.set_state_dict_type(state_dict_type_policy) - self.use_orig_params = str_to_bool(os.environ.get(prefix + "USE_ORIG_PARAMS", "False")) == 1 - self.sync_module_states = str_to_bool(os.environ.get(prefix + "SYNC_MODULE_STATES", "True")) == 1 - self.forward_prefetch = str_to_bool(os.environ.get(prefix + "FORWARD_PREFETCH", "False")) == 1 - self.activation_checkpointing = str_to_bool(os.environ.get(prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1 - - if self.sync_module_states: - device = torch.device("hpu", torch.hpu.current_device()) - self.param_init_fn = lambda x: x.to_empty(device=device, recurse=False) - - -@dataclass -class GaudiFP8RecipeKwargs(KwargsHandler): - """ - Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision training with `transformer-engine`. 
- - Adapted from: https://github.com/huggingface/accelerate/blob/v0.27.2/src/accelerate/utils/dataclasses.py#L180 - - Args: - margin (`int`, *optional*, defaults to 0): - The margin to use for the scaling factor computation. - interval (`int`, *optional*, defaults to 16): - The interval to use for how often the scaling factor is recomputed. - fp8_format (`str`, *optional*, defaults to "HYBRID"): - The format to use for the FP8 recipe. Must be one of `E5M2` or `HYBRID`. - amax_history_len (`int`, *optional*, defaults to 1): - The length of the history to use for the scaling factor computation - amax_compute_algo (`str`, *optional*, defaults to "most_recent"): - The algorithm to use for the scaling factor computation. Must be one of `max` or `most_recent`. - reduce_amax (`bool`, *optional*, defaults to "False"): - By default, if `torch.distributed` is initialized, the `amax` value for FP8 - tensors is reduced across the `fp8_group` (specified in the `fp8_autocast` - call). This keeps the amaxes and scaling factors synced across the given - distributed group. If set to `False`, this reduction is skipped and every - HPU maintains local amaxes and scaling factors. To ensure results are - numerically identical across checkpointing boundaries in this case, all - ranks must checkpoint in order to store the local tensors. - """ - - margin: int = 0 - interval: int = 16 - fp8_format: str = "HYBRID" - amax_compute_algo: str = "most_recent" - amax_history_len: int = 1 - reduce_amax: bool = False - - def __post_init__(self): - self.fp8_format = self.fp8_format.upper() - assert self.fp8_format in ("E5M2", "HYBRID"), "Only E5M2 and HYBRID FP8 formats are currently supported." - assert self.amax_compute_algo in ( - "max", - "most_recent", - ), "Only max and most_recent `amax_compute_algo` modes are currently supported." diff --git a/optimum/habana/accelerate/utils/operations.py b/optimum/habana/accelerate/utils/operations.py deleted file mode 100644 index 21b01680df..0000000000 --- a/optimum/habana/accelerate/utils/operations.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A set of basic tensor ops compatible with tpu, gpu, and multigpu -""" - -import torch -from accelerate.utils.operations import _gpu_broadcast, is_tensor_information, recursively_apply - -from ..state import GaudiPartialState -from ..utils import GaudiDistributedType - - -def initialize_tensors(data_structure): - """ - Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`]. - - Returns: - The same data structure as `data` with tensors instead of [`~utils.TensorInformation`]. 
- """ - - def _initialize_tensor(tensor_info): - return torch.zeros(*tensor_info.shape, dtype=tensor_info.dtype) - - return recursively_apply(_initialize_tensor, data_structure, test_type=is_tensor_information) - - -def broadcast(tensor, from_process: int = 0): - """ - Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices. - - Args: - tensor (nested list/tuple/dictionary of `torch.Tensor`): - The data to gather. - from_process (`int`, *optional*, defaults to 0): - The process from which to send the data - - Returns: - The same data structure as `tensor` with all tensors broadcasted to the proper device. - """ - if GaudiPartialState().distributed_type in [GaudiDistributedType.MULTI_HPU, GaudiDistributedType.DEEPSPEED]: - return _gpu_broadcast(tensor, src=from_process) - return tensor - - -def broadcast_object_list(object_list, from_process: int = 0): - """ - Broadcast a list of picklable objects form one process to the others. - - Args: - object_list (list of picklable objects): - The list of objects to broadcast. This list will be modified inplace. - from_process (`int`, *optional*, defaults to 0): - The process from which to send the data. - - Returns: - The same list containing the objects from process 0. - """ - if GaudiPartialState().distributed_type in [GaudiDistributedType.MULTI_HPU, GaudiDistributedType.DEEPSPEED]: - torch.distributed.broadcast_object_list(object_list, src=from_process, device="hpu") - return object_list diff --git a/optimum/habana/accelerate/utils/transformer_engine.py b/optimum/habana/accelerate/utils/transformer_engine.py deleted file mode 100755 index c4aaba69a7..0000000000 --- a/optimum/habana/accelerate/utils/transformer_engine.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools - -import torch - - -has_transformer_engine = False - - -def import_te(): - global te, has_transformer_engine - try: - import habana_frameworks.torch.hpex.experimental.transformer_engine as te - - has_transformer_engine = True - - except ImportError: - has_transformer_engine = False - - -def is_fp8_available(): - if not has_transformer_engine: - import_te() - return has_transformer_engine - - -def _convert_model(model, to_transformer_engine=True, _convert_linear=True): - """ - Recursively converts the linear layer of a model to their `transformers_engine` counterpart. - """ - from optimum.habana.transformers.models.llama.modeling_llama import ModuleFusedSDPA - - if not is_fp8_available(): - raise ImportError("Using `convert_model` requires transformer_engine to be installed.") - for name, module in model.named_children(): - if isinstance(module, torch.nn.Linear) and to_transformer_engine and _convert_linear: - has_bias = module.bias is not None - # Initializing TE linear without weights and biases and shallow copying them from the original module. 
- te_module = te.Linear( - module.in_features, - module.out_features, - bias=has_bias, - params_dtype=module.weight.dtype, - skip_weight_param_allocation=True, - ) - te_module.weight = module.weight - - if has_bias: - te_module.bias = module.bias - - setattr(model, name, te_module) - elif isinstance(module, te.Linear) and not to_transformer_engine and _convert_linear: - has_bias = module.bias is not None - new_module = torch.nn.Linear( - module.in_features, - module.out_features, - bias=has_bias, - dtype=module.weight.dtype, - device=module.weight.device, - ) - new_module.weight.copy_(module.weight) - if has_bias: - new_module.bias.copy_(module.bias) - - setattr(model, name, new_module) - elif isinstance(module, ModuleFusedSDPA) and module.flash_attention_fp8 and to_transformer_engine: - from habana_frameworks.torch.hpex.experimental.transformer_engine import FusedAttention as te_FusedAttention - module._hpu_kernel_fsdpa = te_FusedAttention( - scale=module.scale, - attention_dropout=module.attention_dropout, - enable_recompute=module.enable_recompute, - ) - setattr(model, name, module) - else: - _convert_model(module, to_transformer_engine=to_transformer_engine, _convert_linear=_convert_linear) - - -def has_transformer_engine_layers(model): - """ - Returns whether a given model has some `transformer_engine` layer or not. - """ - if not is_fp8_available(): - raise ImportError("Using `has_transformer_engine_layers` requires transformer_engine to be installed.") - for m in model.modules(): - if isinstance(m, (te.Linear)): - return True - return False - - -def convert_model(model): - """ - Converts torch.nn.Linear modules to `transformers_engine` Linear modules. - Adapted from: https://github.com/huggingface/accelerate/blob/v0.27.2/src/accelerate/accelerator.py#L1303 - """ - if not has_transformer_engine_layers(model): - with torch.no_grad(): - _convert_model(model) - model._converted_to_transformer_engine = True - return model - - -def get_fp8_recipe(fp8_recipe_handler): - """ - Creates transformer engine FP8 recipe object. - Adapted from: https://github.com/huggingface/accelerate/blob/v0.27.2/src/accelerate/accelerator.py#L1309 - """ - if not is_fp8_available(): - raise ImportError("Using `get_fp8_recipe` requires transformer_engine to be installed.") - kwargs = fp8_recipe_handler.to_dict() if fp8_recipe_handler is not None else {} - if "fp8_format" in kwargs: - kwargs["fp8_format"] = getattr(te.recipe.Format, kwargs["fp8_format"]) - fp8_recipe_handler = te.recipe.DelayedScaling(**kwargs) - fp8_recipe_handler.backend = "TE" - return fp8_recipe_handler - - -class FP8ContextWrapper: - """ - Helper class for FP8 context related operations. - """ - - def __init__(self, ctx, fp8_recipe): - self.ctx = ctx - self.fp8_ctx = self.create_fp8_context(fp8_recipe) - - def __enter__(self): - self.ctx.__enter__() - self.fp8_ctx.__enter__() - - def __exit__(self, exc_type, exc_value, exc_traceback): - self.fp8_ctx.__exit__(exc_type, exc_value, exc_traceback) - self.ctx.__exit__(exc_type, exc_value, exc_traceback) - - @staticmethod - def create_fp8_context(fp8_recipe): - return te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe) - - @staticmethod - def _gradient_checkpointing_wrap(func, *args, **kwargs): - """ - `_gradient_checkpointing_func` always takes the function to be recomputed as the first argument. The function - below wraps this first argument with `transformer_engine`'s `activation_checkpointing` context. 
- """ - _args = list(args) - _args[0] = te.distributed.activation_checkpointing()(_args[0]) - args = tuple(_args) - - return func(*args, **kwargs) - - @staticmethod - def gradient_checkpointing_wrap(model): - """ - Wrap `_gradient_checkpointing_func` in the model with `transformer_engine`'s `activation_checkpointing` context. - This context is used to signal the `transformer_engine` modules whether they have been called with activation checkpointing enabled or not. - """ - if hasattr(model, "gradient_checkpointing") and model.gradient_checkpointing: - model._gradient_checkpointing_func = functools.partial( - FP8ContextWrapper._gradient_checkpointing_wrap, model._gradient_checkpointing_func - ) - return - - for module in model.modules(): - if hasattr(module, "gradient_checkpointing") and module.gradient_checkpointing: - module._gradient_checkpointing_func = functools.partial( - FP8ContextWrapper._gradient_checkpointing_wrap, module._gradient_checkpointing_func - ) diff --git a/optimum/habana/checkpoint_utils.py b/optimum/habana/checkpoint_utils.py deleted file mode 100644 index 895b721518..0000000000 --- a/optimum/habana/checkpoint_utils.py +++ /dev/null @@ -1,152 +0,0 @@ -import json -import os -from pathlib import Path - -import torch -from huggingface_hub import list_repo_files, snapshot_download -from transformers import modeling_utils -from transformers.utils import is_offline_mode - - -def get_repo_root(model_name_or_path, local_rank=-1, token=None): - """ - Downloads the specified model checkpoint and returns the repository where it was downloaded. - """ - if Path(model_name_or_path).is_dir(): - # If it is a local model, no need to download anything - return model_name_or_path - else: - # Checks if online or not - if is_offline_mode(): - if local_rank == 0: - print("Offline mode: forcing local_files_only=True") - - # Only download PyTorch weights by default - if any( - ".safetensors" in filename for filename in list_repo_files(model_name_or_path, token=token) - ): # Some models like Falcon-180b are in only safetensors format - allow_patterns = ["*.safetensors"] - elif any(".bin" in filename for filename in list_repo_files(model_name_or_path, token=token)): - allow_patterns = ["*.bin"] - else: - raise TypeError("Only PyTorch models are supported") - - # Download only on first process - if local_rank in [-1, 0]: - cache_dir = snapshot_download( - model_name_or_path, - local_files_only=is_offline_mode(), - cache_dir=os.getenv("TRANSFORMERS_CACHE", None), - allow_patterns=allow_patterns, - max_workers=16, - token=token, - ) - if local_rank == -1: - # If there is only one process, then the method is finished - return cache_dir - - # Make all processes wait so that other processes can get the checkpoint directly from cache - if torch.distributed.is_initialized(): - torch.distributed.barrier() - - return snapshot_download( - model_name_or_path, - local_files_only=is_offline_mode(), - cache_dir=os.getenv("TRANSFORMERS_CACHE", None), - allow_patterns=allow_patterns, - token=token, - ) - - -def get_checkpoint_files(model_name_or_path, local_rank, token=None): - cached_repo_dir = get_repo_root(model_name_or_path, local_rank=local_rank, token=token) - - # Extensions: .bin | .safetensors | .pt - # Creates a list of paths from all downloaded files in cache dir - - if any(file.suffix == ".bin" for file in Path(cached_repo_dir).rglob("*")): - (name, ext) = os.path.splitext(modeling_utils.WEIGHTS_NAME) - elif any(file.suffix == ".safetensors" for file in Path(cached_repo_dir).rglob("*")): - (name, 
ext) = os.path.splitext(modeling_utils.SAFE_WEIGHTS_NAME) - else: - (name, ext) = ("*", ".pt") - - file_list = [ - str(entry) - for entry in Path(cached_repo_dir).rglob("*") - if (entry.is_file() and entry.name.startswith(name) and entry.name.endswith(ext)) - ] - - return file_list - - -def write_checkpoints_json(model_name_or_path, local_rank, f, token=None): - """ - Dumps metadata into a JSON file for DeepSpeed-inference. - """ - checkpoint_files = get_checkpoint_files(model_name_or_path, local_rank, token) - data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0} - json.dump(data, f) - f.flush() - - -def model_on_meta(config): - """ - Checks if load the model to meta. - """ - return config.model_type in ["bloom", "llama", "falcon", "mixtral"] - - -def get_optimized_model_name(config): - from .transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES - - for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: - if model_type == config.model_type: - return model_type - - return None - - -def model_is_optimized(config): - """ - Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a - new input token_idx. - """ - return get_optimized_model_name(config) is not None - - -def get_ds_injection_policy(config): - model_type = get_optimized_model_name(config) - policy = {} - if model_type: - if model_type == "bloom": - from transformers.models.bloom.modeling_bloom import BloomBlock - - policy = {BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")} - - if model_type == "opt": - from transformers.models.opt.modeling_opt import OPTDecoderLayer - - policy = {OPTDecoderLayer: ("self_attn.out_proj", ".fc2")} - - if model_type == "gpt2": - from transformers.models.gpt2.modeling_gpt2 import GPT2MLP - - policy = {GPT2MLP: ("attn.c_proj", "mlp.c_proj")} - - if model_type == "gptj": - from transformers.models.gptj.modeling_gptj import GPTJBlock - - policy = {GPTJBlock: ("attn.out_proj", "mlp.fc_out")} - - if model_type == "gpt_neox": - from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer - - policy = {GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")} - - if model_type == "llama": - from transformers.models.llama.modeling_llama import LlamaDecoderLayer - - policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} - - return policy diff --git a/optimum/habana/diffusers/__init__.py b/optimum/habana/diffusers/__init__.py deleted file mode 100644 index 26d5d2d359..0000000000 --- a/optimum/habana/diffusers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .pipelines.controlnet.pipeline_controlnet import GaudiStableDiffusionControlNetPipeline -from .pipelines.pipeline_utils import GaudiDiffusionPipeline -from .pipelines.stable_diffusion.pipeline_stable_diffusion import GaudiStableDiffusionPipeline -from .pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import GaudiStableDiffusionLDM3DPipeline -from .pipelines.stable_diffusion.pipeline_stable_diffusion_upscale import GaudiStableDiffusionUpscalePipeline -from .pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import GaudiStableDiffusionXLPipeline -from .pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import GaudiStableVideoDiffusionPipeline -from .schedulers import GaudiDDIMScheduler, GaudiEulerAncestralDiscreteScheduler, GaudiEulerDiscreteScheduler diff --git a/optimum/habana/diffusers/models/__init__.py b/optimum/habana/diffusers/models/__init__.py deleted file mode 100644 index 245359bcf2..0000000000 --- 
a/optimum/habana/diffusers/models/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .unet_2d_condition import gaudi_unet_2d_condition_model_forward diff --git a/optimum/habana/diffusers/models/attention_processor.py b/optimum/habana/diffusers/models/attention_processor.py deleted file mode 100755 index 0e39eb4678..0000000000 --- a/optimum/habana/diffusers/models/attention_processor.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -from typing import Optional, Union - -import torch -import torch.nn.functional as F -from diffusers.models.attention_processor import Attention -from diffusers.utils import USE_PEFT_BACKEND, logging -from diffusers.utils.import_utils import is_xformers_available -from torch import nn - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -if is_xformers_available(): - import xformers - import xformers.ops -else: - xformers = None - - -class Softmax(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, dim = None, invAttnHead= None): - return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) - -class Matmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, *args, **kwargs): - return torch.matmul(*args, **kwargs) - -# ScaledDotProductAttention is based on torch.nn.functional.scaled_dot_product_attention -class ScaledDotProductAttention(nn.Module): - def __init__(self): - super().__init__() - self.bmm1 = Matmul() - self.bmm2 = Matmul() - self.softmax = Softmax() - - def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, - is_causal=False, scale=None) -> torch.Tensor: - # Efficient implementation: - L, S = query.size(-2), key.size(-2) - scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale - invAttnHead = torch.tensor(scale_factor, dtype=torch.float32).to('hpu') - attn_bias = torch.zeros(L, S, dtype=query.dtype) - - if is_causal: - assert attn_mask is None - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) - attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) - attn_bias.to(query.dtype) - - if attn_mask is not None: - if attn_mask.dtype == torch.bool: - attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf")) - else: - attn_bias += attn_mask - - if(S<128): - attn_weight = self.bmm1(key,query.transpose(-2, -1)) - attn_weight = self.softmax(attn_weight, dim=-2, invAttnHead=invAttnHead) - attn_weight = torch.dropout(attn_weight, dropout_p, train=True) - return self.bmm2(attn_weight.transpose(-2, -1), value) - else: - attn_weight = self.bmm1(query, key.transpose(-2, -1)) - attn_weight = self.softmax(attn_weight, dim=-1, invAttnHead=invAttnHead) - attn_weight = torch.dropout(attn_weight, dropout_p, train=True) - return self.bmm2(attn_weight, value) - - -# Copied from diffusers.models.attention_processor.AttnProcessor2_0 -class AttnProcessor2_0: - r""" - Processor for implementing scaled dot-product attention 
(enabled by default if you're using PyTorch 2.0). - """ - - def __init__(self, attention_module=None): - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") - self.attention_module = attention_module - - def __call__( - self, - attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, - scale: float = 1.0, - ) -> torch.FloatTensor: - residual = hidden_states - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - args = () if USE_PEFT_BACKEND else (scale,) - query = attn.to_q(hidden_states, *args) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states, *args) - value = attn.to_v(encoder_hidden_states, *args) - - inner_dim = key.shape[-1] - head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - # hidden_states = F.scaled_dot_product_attention( - # query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - # ) - if os.environ.get('PATCH_SDPA') is not None: - hidden_states = self.attention_module( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - else: - import habana_frameworks.torch.hpu as ht - from habana_frameworks.torch.hpex.kernels import FusedSDPA - with ht.sdp_kernel(enable_recompute = True): - hidden_states = FusedSDPA.apply(query, key, value, attention_mask, 0.0, False) - - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) - - # linear proj - hidden_states = attn.to_out[0](hidden_states, *args) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - -AttentionProcessor = Union[ - AttnProcessor2_0, -] diff --git a/optimum/habana/diffusers/models/unet_2d_condition.py 
b/optimum/habana/diffusers/models/unet_2d_condition.py deleted file mode 100644 index 19d097e0e7..0000000000 --- a/optimum/habana/diffusers/models/unet_2d_condition.py +++ /dev/null @@ -1,352 +0,0 @@ -from typing import Any, Dict, Optional, Tuple, Union - -import habana_frameworks.torch.core as htcore -import torch -import torch.utils.checkpoint -from diffusers.models.unet_2d_condition import UNet2DConditionOutput -from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.forward -def gaudi_unet_2d_condition_model_forward( - self, - sample: torch.FloatTensor, - timestep: Union[torch.Tensor, float, int], - encoder_hidden_states: torch.Tensor, - class_labels: Optional[torch.Tensor] = None, - timestep_cond: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, - mid_block_additional_residual: Optional[torch.Tensor] = None, - down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: - r""" - Copied from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/models/unets/unet_2d_condition.py#L843 - - Changes: - - Adds a workaround to be able to compute `conv_in` with Torch Autocast and full bf16 precision. - - Added mark_step in unet forward - """ - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - - for dim in sample.shape[-2:]: - if dim % default_overall_up_factor != 0: - # Forward upsample size to force interpolation output size. - forward_upsample_size = True - break - - # ensure attention_mask is a bias, and give it a singleton query_tokens dimension - # expects mask of shape: - # [batch, key_tokens] - # adds singleton query_tokens dimension: - # [batch, 1, key_tokens] - # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: - # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) - # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) - if attention_mask is not None: - # assume that mask is expressed as: - # (1 = keep, 0 = discard) - # convert mask into a bias that can be added to attention scores: - # (keep = +0, discard = -10000.0) - attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 - attention_mask = attention_mask.unsqueeze(1) - - # convert encoder_attention_mask to a bias the same way we do for attention_mask - if encoder_attention_mask is not None: - encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 - encoder_attention_mask = encoder_attention_mask.unsqueeze(1) - - # 0. 
center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # 1. time - timesteps = timestep - if not torch.is_tensor(timesteps): - # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - is_mps = sample.device.type == "mps" - if isinstance(timestep, float): - dtype = torch.float32 if is_mps else torch.float64 - else: - dtype = torch.int32 if is_mps else torch.int64 - timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None].to(sample.device) - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand(sample.shape[0]) - - t_emb = self.time_proj(timesteps) - htcore.mark_step() - - # `Timesteps` does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.to(dtype=sample.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - aug_emb = None - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - # `Timesteps` does not contain any weights and will always return f32 tensors - # there might be better ways to encapsulate this. - class_labels = class_labels.to(dtype=sample.dtype) - - class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) - - if self.config.class_embeddings_concat: - emb = torch.cat([emb, class_emb], dim=-1) - else: - emb = emb + class_emb - - if self.config.addition_embed_type == "text": - aug_emb = self.add_embedding(encoder_hidden_states) - elif self.config.addition_embed_type == "text_image": - # Kandinsky 2.1 - style - if "image_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" - ) - - image_embs = added_cond_kwargs.get("image_embeds") - text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) - aug_emb = self.add_embedding(text_embs, image_embs) - elif self.config.addition_embed_type == "text_time": - # SDXL - style - if "text_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" - ) - text_embeds = added_cond_kwargs.get("text_embeds") - if "time_ids" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" - ) - time_ids = added_cond_kwargs.get("time_ids") - time_embeds = self.add_time_proj(time_ids.flatten()) - time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) - add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) - add_embeds = add_embeds.to(emb.dtype) - aug_emb = self.add_embedding(add_embeds) - elif self.config.addition_embed_type == "image": - # Kandinsky 2.2 - style - if "image_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'image' 
which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" - ) - image_embs = added_cond_kwargs.get("image_embeds") - aug_emb = self.add_embedding(image_embs) - elif self.config.addition_embed_type == "image_hint": - # Kandinsky 2.2 - style - if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" - ) - image_embs = added_cond_kwargs.get("image_embeds") - hint = added_cond_kwargs.get("hint") - aug_emb, hint = self.add_embedding(image_embs, hint) - sample = torch.cat([sample, hint], dim=1) - - emb = emb + aug_emb if aug_emb is not None else emb - - if self.time_embed_act is not None: - emb = self.time_embed_act(emb) - - if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": - encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) - elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": - # Kadinsky 2.1 - style - if "image_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" - ) - - image_embeds = added_cond_kwargs.get("image_embeds") - encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) - elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": - # Kandinsky 2.2 - style - if "image_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" - ) - image_embeds = added_cond_kwargs.get("image_embeds") - encoder_hidden_states = self.encoder_hid_proj(image_embeds) - elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": - if "image_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" - ) - image_embeds = added_cond_kwargs.get("image_embeds") - image_embeds = self.encoder_hid_proj(image_embeds) - encoder_hidden_states = (encoder_hidden_states, image_embeds) - - # 2. pre-process - import habana_frameworks.torch.hpu as hthpu - - # Workaround for SynapseAI 1.11 for Torch Autocast - # TODO: to remove in SynapseAI 1.13? - if hthpu.is_autocast_hpu_enabled(): - sample = self.conv_in(sample.to(torch.float)) - # Workaround for Synapse 1.11 for full bf16 - elif self.conv_in.bias.dtype == torch.float and sample.dtype == torch.bfloat16: - sample = self.conv_in(sample.to(torch.float)).to(torch.bfloat16) - else: - sample = self.conv_in(sample) - - # 2.5 GLIGEN position net - if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None: - cross_attention_kwargs = cross_attention_kwargs.copy() - gligen_args = cross_attention_kwargs.pop("gligen") - cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)} - - # 3. 
down - lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 - if USE_PEFT_BACKEND: - # weight the lora layers by setting `lora_scale` for each PEFT layer - scale_lora_layers(self, lora_scale) - - is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None - # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets - is_adapter = down_intrablock_additional_residuals is not None - # maintain backward compatibility for legacy usage, where - # T2I-Adapter and ControlNet both use down_block_additional_residuals arg - # but can only use one or the other - if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None: - deprecate( - "T2I should not use down_block_additional_residuals", - "1.3.0", - "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ - and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ - for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ", - standard_warn=False, - ) - down_intrablock_additional_residuals = down_block_additional_residuals - is_adapter = True - - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - # For t2i-adapter CrossAttnDownBlock2D - additional_residuals = {} - if is_adapter and len(down_intrablock_additional_residuals) > 0: - additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0) - - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - encoder_attention_mask=encoder_attention_mask, - **additional_residuals, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale) - if is_adapter and len(down_intrablock_additional_residuals) > 0: - sample += down_intrablock_additional_residuals.pop(0) - - down_block_res_samples += res_samples - - if is_controlnet: - new_down_block_res_samples = () - - for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals - ): - down_block_res_sample = down_block_res_sample + down_block_additional_residual - new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) - - down_block_res_samples = new_down_block_res_samples - - # 4. mid - if self.mid_block is not None: - if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - encoder_attention_mask=encoder_attention_mask, - ) - else: - sample = self.mid_block(sample, emb) - - # To support T2I-Adapter-XL - if ( - is_adapter - and len(down_intrablock_additional_residuals) > 0 - and sample.shape == down_intrablock_additional_residuals[0].shape - ): - sample += down_intrablock_additional_residuals.pop(0) - - if is_controlnet: - sample = sample + mid_block_additional_residual - - # 5. 
up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - upsample_size=upsample_size, - attention_mask=attention_mask, - encoder_attention_mask=encoder_attention_mask, - ) - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, - scale=lora_scale, - ) - - # 6. post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if USE_PEFT_BACKEND: - # remove `lora_scale` from each PEFT layer - unscale_lora_layers(self, lora_scale) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py deleted file mode 100644 index 57675812f7..0000000000 --- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ /dev/null @@ -1,838 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
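The mark_step() call added in the removed UNet forward above is the usual HPU lazy-mode synchronization point: habana_frameworks.torch.core.mark_step() flushes the queued ops and launches them on the device. A minimal sketch of the pattern (it requires a Gaudi/SynapseAI environment; the module and input below are placeholders):

import torch
import habana_frameworks.torch.core as htcore

model = torch.nn.Linear(16, 16).to("hpu")   # placeholder module
x = torch.randn(4, 16).to("hpu")

y = model(x)          # in lazy mode, ops are only queued here
htcore.mark_step()    # flush the queue and execute the accumulated graph on the HPU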
- -import time -from math import ceil -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import torch -from diffusers.image_processor import PipelineImageInput -from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel -from diffusers.pipelines.controlnet import StableDiffusionControlNetPipeline -from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import deprecate -from diffusers.utils.torch_utils import is_compiled_module -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from optimum.utils import logging - -from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import HabanaProfile, speed_metrics -from ..pipeline_utils import GaudiDiffusionPipeline -from ..stable_diffusion.pipeline_stable_diffusion import ( - GaudiStableDiffusionPipeline, - GaudiStableDiffusionPipelineOutput, - retrieve_timesteps, -) - - -logger = logging.get_logger(__name__) - - -class GaudiStableDiffusionControlNetPipeline(GaudiDiffusionPipeline, StableDiffusionControlNetPipeline): - """ - Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/controlnet/pipeline_controlnet.py#L94 - - Generation is performed by batches - - Two `mark_step()` were added to add support for lazy mode - - Added support for HPU graphs - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - text_encoder ([`~transformers.CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer (`~transformers.CLIPTokenizer`): - A `CLIPTokenizer` to tokenize text. - unet ([`UNet2DConditionModel`]): - A `UNet2DConditionModel` to denoise the encoded image latents. - controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): - Provides additional conditioning to the `unet` during the denoising process. If you set multiple - ControlNets as a list, the outputs from each ControlNet are added together to create one combined - additional conditioning. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details - about a model's potential harms. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. 
- This will be faster and save memory compared to fp32/mixed precision but can harm generated images. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel]], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection = None, - requires_safety_checker: bool = True, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__( - self, - use_habana, - use_hpu_graphs, - gaudi_config, - bf16_full_eval, - ) - - StableDiffusionControlNetPipeline.__init__( - self, - vae, - text_encoder, - tokenizer, - unet, - controlnet, - scheduler, - safety_checker, - feature_extractor, - image_encoder, - requires_safety_checker, - ) - - self.to(self._device) - - def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (num_images, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != num_images: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective number" - f" of images of {num_images}. Make sure the number of images matches the length of the generators." - ) - - if latents is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(num_images) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: PipelineImageInput = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - batch_size: int = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, - control_guidance_start: Union[float, List[float]] = 0.0, - control_guidance_end: Union[float, List[float]] = 1.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - 
callback_on_step_end_tensor_inputs: List[str] = ["latents"], - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, - each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, - where a list of image lists can be passed to batch for each prompt and each ControlNet. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - batch_size (`int`, *optional*, defaults to 1): - The number of images in a batch. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set - the corresponding scale as a list. - guess_mode (`bool`, *optional*, defaults to `False`): - The ControlNet encoder tries to recognize the content of the input image even if you remove all - prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. - control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): - The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): - The percentage of total steps at which the ControlNet stops applying. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. - profiling_warmup_steps (`int`, *optional*): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*): - Number of steps to be captured when enabling profiling. 
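To make the call signature above concrete, a hedged usage sketch for this (now removed) pipeline; the checkpoints and the Gaudi configuration name are illustrative, the conditioning image is a blank placeholder rather than a real Canny map, and running it assumes a Gaudi host with a version of optimum-habana that still ships the class:

from PIL import Image
from diffusers import ControlNetModel
from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipeline = GaudiStableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",  # illustrative Gaudi config from the Hub
)

canny_image = Image.new("RGB", (512, 512))   # placeholder; normally a Canny edge map of the input
images = pipeline(
    prompt="a bird, best quality",
    image=canny_image,
    num_inference_steps=50,
    batch_size=4,
    num_images_per_prompt=8,
).images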
- - Returns: - [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - - # align format for control guidance - if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): - control_guidance_start = len(control_guidance_end) * [control_guidance_start] - elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): - control_guidance_end = len(control_guidance_start) * [control_guidance_end] - elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 - control_guidance_start, control_guidance_end = ( - mult * [control_guidance_start], - mult * [control_guidance_end], - ) - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - image, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - controlnet_conditioning_scale, - control_guidance_start, - control_guidance_end, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - num_prompts = 1 - elif prompt is not None and isinstance(prompt, list): - num_prompts = len(prompt) - else: - num_prompts = prompt_embeds.shape[0] - num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) - - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) - - global_pool_conditions = ( - controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) - else controlnet.nets[0].config.global_pool_conditions - ) - guess_mode = guess_mode or global_pool_conditions - - # 3. 
Encode input prompt - text_encoder_lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - # if do_classifier_free_guidance: - # prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt - ) - - # 4. Prepare image - if isinstance(controlnet, ControlNetModel): - image = self.prepare_image( - image=image, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - height, width = image.shape[-2:] - elif isinstance(controlnet, MultiControlNetModel): - images = [] - - # Nested lists as ControlNet condition - if isinstance(image[0], list): - # Transpose the nested image list - image = [list(t) for t in zip(*image)] - - for image_ in image: - image_ = self.prepare_image( - image=image_, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - images.append(image_) - - image = images - height, width = image[0].shape[-2:] - else: - assert False - - # 5. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) - self._num_timesteps = len(timesteps) - - # 6. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - num_prompts * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6.5 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( - batch_size * num_images_per_prompt - ) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7.1 Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 7.2 Create tensor stating which controlnets to keep - controlnet_keep = [] - for i in range(len(timesteps)): - keeps = [ - 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) - for s, e in zip(control_guidance_start, control_guidance_end) - ] - controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) - - # 7.3 Split into batches (HPU-specific step) - ( - latents_batches, - text_embeddings_batches, - num_dummy_samples, - ) = GaudiStableDiffusionPipeline._split_inputs_into_batches( - batch_size, - latents, - prompt_embeds, - negative_prompt_embeds, - ) - - outputs = { - "images": [], - "has_nsfw_concept": [], - } - t0 = time.time() - t1 = t0 - - self._num_timesteps = len(timesteps) - - hb_profiler = HabanaProfile( - warmup=profiling_warmup_steps, - active=profiling_steps, - record_shapes=False, - ) - hb_profiler.start() - - # 8. Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - text_embeddings_batch = text_embeddings_batches[0] - text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - for i in range(num_inference_steps): - t = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if self.do_classifier_free_guidance else latents_batch - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # controlnet(s) inference - if guess_mode and self.do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - control_model_input = latents_batch - control_model_input = self.scheduler.scale_model_input(control_model_input, t) - controlnet_prompt_embeds = text_embeddings_batch.chunk(2)[1] - else: - control_model_input = latent_model_input - controlnet_prompt_embeds = text_embeddings_batch - - if isinstance(controlnet_keep[i], list): - cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] - else: - controlnet_cond_scale = controlnet_conditioning_scale - if isinstance(controlnet_cond_scale, list): - controlnet_cond_scale = controlnet_cond_scale[0] - cond_scale = controlnet_cond_scale * controlnet_keep[i] - - down_block_res_samples, mid_block_res_sample = self.controlnet_hpu( - control_model_input, - t, - controlnet_prompt_embeds, - image, - cond_scale, - guess_mode, - ) - - if guess_mode and self.do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. - # To apply the output of ControlNet to both the unconditional and conditional batches, - # add 0 to the unconditional batch to keep it unchanged. 
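# Note: the unconditional half comes first in the classifier-free-guidance batch (the negative
# prompt embeddings are concatenated before the positive ones when the batches are built),
# which is why the zeroed residuals are concatenated in front of `d` just below.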
- down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] - mid_block_res_sample = torch.cat( - [torch.zeros_like(mid_block_res_sample), mid_block_res_sample] - ) - - # predict the noise residual - noise_pred = self.unet_hpu( - latent_model_input, - t, - text_embeddings_batch, - timestep_cond, - self.cross_attention_kwargs, - down_block_res_samples, - mid_block_res_sample, - added_cond_kwargs, - ) - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step( - noise_pred, t, latents_batch, **extra_step_kwargs, return_dict=False - )[0] - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents_batch) - prompt_embeds = callback_outputs.pop("prompt_embeds", text_embeddings_batches) - # negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents_batch) - - hb_profiler.step() - - if not output_type == "latent": - # 8. Post-processing - output_image = self.vae.decode( - latents_batch / self.vae.config.scaling_factor, return_dict=False, generator=generator - )[0] - else: - output_image = latents_batch - outputs["images"].append(output_image) - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - hb_profiler.stop() - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] - - # Process generated images - for i, image in enumerate(outputs["images"][:]): - if i == 0: - outputs["images"].clear() - - if output_type == "latent": - has_nsfw_concept = None - else: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if output_type == "pil": - outputs["images"] += image - else: - outputs["images"] += [*image] - - if has_nsfw_concept is not None: - outputs["has_nsfw_concept"] += has_nsfw_concept - else: - outputs["has_nsfw_concept"] = None - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (outputs["images"], outputs["has_nsfw_concept"]) - - return GaudiStableDiffusionPipelineOutput( - images=outputs["images"], - nsfw_content_detected=outputs["has_nsfw_concept"], - 
throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) - - @torch.no_grad() - def unet_hpu( - self, - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - down_block_additional_residuals, - mid_block_additional_residual, - added_cond_kwargs, - ): - if self.use_hpu_graphs: - return self.unet_capture_replay( - latent_model_input, - timestep, - encoder_hidden_states, - down_block_additional_residuals, - mid_block_additional_residual, - ) - else: - return self.unet( - latent_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - timestep_cond=timestep_cond, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=down_block_additional_residuals, - mid_block_additional_residual=mid_block_additional_residual, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - - @torch.no_grad() - def unet_capture_replay( - self, - latent_model_input, - timestep, - encoder_hidden_states, - down_block_additional_residuals, - mid_block_additional_residual, - ): - inputs = [ - latent_model_input, - timestep, - encoder_hidden_states, - down_block_additional_residuals, - mid_block_additional_residual, - False, - ] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - outputs = self.unet( - inputs[0], - inputs[1], - inputs[2], - None, - None, - None, - None, - None, - inputs[3], - inputs[4], - None, - None, - inputs[5], - )[0] - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs - - @torch.no_grad() - def controlnet_hpu( - self, - control_model_input, - timestep, - encoder_hidden_states, - controlnet_cond, - conditioning_scale, - guess_mode, - ): - if self.use_hpu_graphs: - return self.controlnet_capture_replay( - control_model_input, - timestep, - encoder_hidden_states, - controlnet_cond, - conditioning_scale, - guess_mode, - ) - else: - return self.controlnet( - control_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=controlnet_cond, - conditioning_scale=conditioning_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - @torch.no_grad() - def controlnet_capture_replay( - self, - control_model_input, - timestep, - encoder_hidden_states, - controlnet_cond, - conditioning_scale, - guess_mode, - ): - inputs = [ - control_model_input, - timestep, - encoder_hidden_states, - controlnet_cond, - conditioning_scale, - guess_mode, - False, - ] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - outputs = self.controlnet( - inputs[0], - inputs[1], - inputs[2], - inputs[3], - inputs[4], - None, - None, - None, - None, - None, - inputs[5], - False, - ) - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated 
inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs diff --git a/optimum/habana/diffusers/pipelines/pipeline_utils.py b/optimum/habana/diffusers/pipelines/pipeline_utils.py deleted file mode 100644 index c2ffdb9cc5..0000000000 --- a/optimum/habana/diffusers/pipelines/pipeline_utils.py +++ /dev/null @@ -1,392 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import importlib -import inspect -import os -import sys -from typing import Callable, Dict, Optional, Union - -import torch -from diffusers.pipelines import DiffusionPipeline -from diffusers.pipelines.pipeline_utils import _unwrap_model -from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card -from diffusers.utils.torch_utils import is_compiled_module -from huggingface_hub import create_repo - -from optimum.habana.utils import to_device_dtype -from optimum.utils import logging - -from ...transformers.gaudi_configuration import GaudiConfig - - -logger = logging.get_logger(__name__) - - -GAUDI_LOADABLE_CLASSES = { - "diffusers": { - "ModelMixin": ["save_pretrained", "from_pretrained"], - "SchedulerMixin": ["save_pretrained", "from_pretrained"], - "DiffusionPipeline": ["save_pretrained", "from_pretrained"], - "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"], - }, - "transformers": { - "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"], - "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"], - "PreTrainedModel": ["save_pretrained", "from_pretrained"], - "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"], - "ProcessorMixin": ["save_pretrained", "from_pretrained"], - "ImageProcessingMixin": ["save_pretrained", "from_pretrained"], - }, - "optimum.habana.diffusers.schedulers": { - "GaudiDDIMScheduler": ["save_pretrained", "from_pretrained"], - "GaudiEulerDiscreteScheduler": ["save_pretrained", "from_pretrained"], - "GaudiEulerAncestralDiscreteScheduler": ["save_pretrained", "from_pretrained"], - }, -} - -GAUDI_ALL_IMPORTABLE_CLASSES = {} -for library in GAUDI_LOADABLE_CLASSES: - GAUDI_ALL_IMPORTABLE_CLASSES.update(GAUDI_LOADABLE_CLASSES[library]) - - -def _fetch_class_library_tuple(module): - # import it here to avoid circular import - from diffusers import pipelines - - # register the config from the original module, not the dynamo compiled one - not_compiled_module = _unwrap_model(module) - library = not_compiled_module.__module__.split(".")[0] - if library == "optimum": - library = "optimum.habana.diffusers.schedulers" - - # check if the module is a pipeline module - module_path_items = not_compiled_module.__module__.split(".") - pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None - - path = not_compiled_module.__module__.split(".") - is_pipeline_module = pipeline_dir in path and 
hasattr(pipelines, pipeline_dir) - - # if library is not in GAUDI_LOADABLE_CLASSES, then it is a custom module. - # Or if it's a pipeline module, then the module is inside the pipeline - # folder so we set the library to module name. - if is_pipeline_module: - library = pipeline_dir - elif library not in GAUDI_LOADABLE_CLASSES: - library = not_compiled_module.__module__ - - # retrieve class_name - class_name = not_compiled_module.__class__.__name__ - - return (library, class_name) - - -class GaudiDiffusionPipeline(DiffusionPipeline): - """ - Extends the [`DiffusionPipeline`](https://huggingface.co/docs/diffusers/api/diffusion_pipeline) class: - - The pipeline is initialized on Gaudi if `use_habana=True`. - - The pipeline's Gaudi configuration is saved and pushed to the hub. - - Args: - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. - This will be faster and save memory compared to fp32/mixed precision but can harm generated images. - """ - - def __init__( - self, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - DiffusionPipeline.__init__(self) - - self.use_habana = use_habana - if self.use_habana: - self.use_hpu_graphs = use_hpu_graphs - if self.use_hpu_graphs: - logger.info("Enabled HPU graphs.") - else: - logger.info("Enabled lazy mode because `use_hpu_graphs=False`.") - - self._device = torch.device("hpu") - - if isinstance(gaudi_config, str): - # Config from the Hub - self.gaudi_config = GaudiConfig.from_pretrained(gaudi_config) - elif isinstance(gaudi_config, GaudiConfig): - # Config already initialized - self.gaudi_config = copy.deepcopy(gaudi_config) - else: - raise ValueError( - f"`gaudi_config` must be a string or a GaudiConfig object but is {type(gaudi_config)}." - ) - - if self.gaudi_config.use_torch_autocast: - if bf16_full_eval: - logger.warning( - "`use_torch_autocast` is True in the given Gaudi configuration but " - "`torch_dtype=torch.bfloat16` was given. Disabling mixed precision and continuing in bf16 only." - ) - self.gaudi_config.use_torch_autocast = False - else: - self.gaudi_config.declare_autocast_bf16_fp32_ops() - - # Workaround for Synapse 1.11 for full bf16 and Torch Autocast - if bf16_full_eval or self.gaudi_config.use_torch_autocast: - import diffusers - - from ..models import ( - gaudi_unet_2d_condition_model_forward, - ) - - diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.forward = ( - gaudi_unet_2d_condition_model_forward - ) - - if self.use_hpu_graphs: - try: - import habana_frameworks.torch as ht - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch. {error.msg}." - raise error - self.ht = ht - self.hpu_stream = ht.hpu.Stream() - self.cache = {} - else: - try: - import habana_frameworks.torch.core as htcore - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch.core. {error.msg}." 
- raise error - self.htcore = htcore - else: - if use_hpu_graphs: - raise ValueError( - "`use_hpu_graphs` is True but `use_habana` is False, please set `use_habana=True` to use HPU" - " graphs." - ) - if gaudi_config is not None: - raise ValueError( - "Got a non-None `gaudi_config` but `use_habana` is False, please set `use_habana=True` to use this" - " Gaudi configuration." - ) - logger.info("Running on CPU.") - self._device = torch.device("cpu") - - def register_modules(self, **kwargs): - for name, module in kwargs.items(): - # retrieve library - if module is None or isinstance(module, (tuple, list)) and module[0] is None: - register_dict = {name: (None, None)} - else: - library, class_name = _fetch_class_library_tuple(module) - register_dict = {name: (library, class_name)} - - # save model index config - self.register_to_config(**register_dict) - - # set models - setattr(self, name, module) - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = True, - variant: Optional[str] = None, - push_to_hub: bool = False, - **kwargs, - ): - """ - Save the pipeline and Gaudi configurations. - More information [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.save_pretrained). - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. - """ - model_index_dict = dict(self.config) - model_index_dict.pop("_class_name", None) - model_index_dict.pop("_diffusers_version", None) - model_index_dict.pop("_module", None) - model_index_dict.pop("_name_or_path", None) - - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) - create_pr = kwargs.pop("create_pr", False) - token = kwargs.pop("token", None) - repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) - repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id - - expected_modules, optional_kwargs = self._get_signature_keys(self) - - def is_saveable_module(name, value): - if name not in expected_modules: - return False - if name in self._optional_components and value[0] is None: - return False - return True - - model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - - for pipeline_component_name in model_index_dict.keys(): - sub_model = getattr(self, pipeline_component_name) - model_cls = sub_model.__class__ - - # Dynamo wraps the original model in a private class. - # I didn't find a public API to get the original class. 
- if is_compiled_module(sub_model): - sub_model = _unwrap_model(sub_model) - model_cls = sub_model.__class__ - - save_method_name = None - # search for the model's base class in GAUDI_LOADABLE_CLASSES - for library_name, library_classes in GAUDI_LOADABLE_CLASSES.items(): - if library_name in sys.modules: - library = importlib.import_module(library_name) - else: - logger.info( - f"{library_name} is not installed. Cannot save {pipeline_component_name} as {library_classes} from {library_name}" - ) - - for base_class, save_load_methods in library_classes.items(): - class_candidate = getattr(library, base_class, None) - if class_candidate is not None and issubclass(model_cls, class_candidate): - # if we found a suitable base class in GAUDI_LOADABLE_CLASSES then grab its save method - save_method_name = save_load_methods[0] - break - if save_method_name is not None: - break - - if save_method_name is None: - logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") - # make sure that unsaveable components are not tried to be loaded afterward - self.register_to_config(**{pipeline_component_name: (None, None)}) - continue - - save_method = getattr(sub_model, save_method_name) - - # Call the save method with the argument safe_serialization only if it's supported - save_method_signature = inspect.signature(save_method) - save_method_accept_safe = "safe_serialization" in save_method_signature.parameters - save_method_accept_variant = "variant" in save_method_signature.parameters - - save_kwargs = {} - if save_method_accept_safe: - save_kwargs["safe_serialization"] = safe_serialization - if save_method_accept_variant: - save_kwargs["variant"] = variant - - save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) - - # finally save the config - self.save_config(save_directory) - if hasattr(self, "gaudi_config"): - self.gaudi_config.save_pretrained(save_directory) - - if push_to_hub: - # Create a new empty model card and eventually tag it - model_card = load_or_create_model_card(repo_id, token=token, is_pipeline=True) - model_card = populate_model_card(model_card) - model_card.save(os.path.join(save_directory, "README.md")) - - self._upload_folder( - save_directory, - repo_id, - token=token, - commit_message=commit_message, - create_pr=create_pr, - ) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - """ - More information [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained). 
- """ - - # Set the correct log level depending on the node - # Already done in super().init() but we have to do it again - # because we use optimum.utils.logging here and not - # diffusers.utils.logging - log_level = kwargs.pop("log_level", logging.INFO) - logging.set_verbosity(log_level) - logging.enable_default_handler() - logging.enable_explicit_format() - - # Import diffusers.pipelines.pipeline_utils to override the values of LOADABLE_CLASSES and ALL_IMPORTABLE_CLASSES - import diffusers.pipelines.pipeline_utils - - diffusers.pipelines.pipeline_utils.LOADABLE_CLASSES = GAUDI_LOADABLE_CLASSES - diffusers.pipelines.pipeline_utils.ALL_IMPORTABLE_CLASSES = GAUDI_ALL_IMPORTABLE_CLASSES - - # Define a new kwarg here to know in the __init__ whether to use full bf16 precision or not - bf16_full_eval = kwargs.get("torch_dtype", None) == torch.bfloat16 - kwargs["bf16_full_eval"] = bf16_full_eval - - return super().from_pretrained( - pretrained_model_name_or_path, - **kwargs, - ) - - @classmethod - def save_lora_weights( - cls, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = True, - ): - # Move the state dict from HPU to CPU before saving - if unet_lora_layers: - unet_lora_layers = to_device_dtype(unet_lora_layers, target_device=torch.device("cpu")) - if text_encoder_lora_layers: - text_encoder_lora_layers = to_device_dtype(text_encoder_lora_layers, target_device=torch.device("cpu")) - if text_encoder_2_lora_layers: - text_encoder_2_lora_layers = to_device_dtype(text_encoder_2_lora_layers, target_device=torch.device("cpu")) - return super().save_lora_weights( - save_directory, - unet_lora_layers, - text_encoder_lora_layers, - text_encoder_2_lora_layers, - is_main_process, - weight_name, - save_function, - safe_serialization, - ) diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py deleted file mode 100644 index 1096dec9ab..0000000000 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ /dev/null @@ -1,705 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
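A short sketch of how the save/load plumbing above behaves end to end, assuming the `GaudiStableDiffusionPipeline` class defined in the next file and a `Habana/stable-diffusion` Gaudi configuration on the Hub; the checkpoint name is a placeholder:

    import torch
    from optimum.habana.diffusers import GaudiStableDiffusionPipeline

    # torch_dtype=torch.bfloat16 makes from_pretrained() pass bf16_full_eval=True to __init__,
    # which switches the pipeline to full-bf16 mode and disables torch autocast in the Gaudi config.
    pipe = GaudiStableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        use_habana=True,
        use_hpu_graphs=True,
        gaudi_config="Habana/stable-diffusion",
        torch_dtype=torch.bfloat16,
    )

    # save_pretrained() stores each sub-model with its own save method and writes the
    # Gaudi configuration alongside model_index.json, so the folder can be reloaded as-is.
    pipe.save_pretrained("./sd-gaudi-checkpoint")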
- -import inspect -import time -from dataclasses import dataclass -from math import ceil -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import PIL -import torch -from diffusers.image_processor import PipelineImageInput -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline, StableDiffusionSafetyChecker -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput, deprecate -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from optimum.utils import logging - -from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment -from ..pipeline_utils import GaudiDiffusionPipeline - - -logger = logging.get_logger(__name__) - - -@dataclass -class GaudiStableDiffusionPipelineOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - throughput: float - - -def retrieve_timesteps( - scheduler, - num_inference_steps: Optional[int] = None, - device: Optional[Union[str, torch.device]] = None, - timesteps: Optional[List[int]] = None, - **kwargs, -): - """ - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Args: - scheduler (`SchedulerMixin`): - The scheduler to get timesteps from. - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. - - Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. - """ - if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) - if not accepts_timesteps: - raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" - f" timestep schedules. Please check whether you are using the correct scheduler." - ) - scheduler.set_timesteps(timesteps=timesteps, device="cpu", **kwargs) - timesteps = scheduler.timesteps.to(device) - num_inference_steps = len(timesteps) - else: - scheduler.set_timesteps(num_inference_steps, device="cpu", **kwargs) - timesteps = scheduler.timesteps.to(device) - - # Handles the case where the scheduler cannot implement reset_timestep_dependent_params() - # Example: UniPCMultiStepScheduler used for inference in ControlNet training as it has non-linear accesses to timestep dependent parameter: sigma. 
- if hasattr(scheduler, "reset_timestep_dependent_params") and callable(scheduler.reset_timestep_dependent_params): - scheduler.reset_timestep_dependent_params() - return timesteps, num_inference_steps - - -class GaudiStableDiffusionPipeline(GaudiDiffusionPipeline, StableDiffusionPipeline): - """ - Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L73 - - Generation is performed by batches - - Two `mark_step()` were added to add support for lazy mode - - Added support for HPU graphs - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - text_encoder ([`~transformers.CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer (`~transformers.CLIPTokenizer`): - A `CLIPTokenizer` to tokenize text. - unet ([`UNet2DConditionModel`]): - A `UNet2DConditionModel` to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details - about a model's potential harms. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. - This will be faster and save memory compared to fp32/mixed precision but can harm generated images. 
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection = None, - requires_safety_checker: bool = True, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__( - self, - use_habana, - use_hpu_graphs, - gaudi_config, - bf16_full_eval, - ) - - # Workaround for Synapse 1.11 for full bf16 - if bf16_full_eval: - unet.conv_in.float() - - StableDiffusionPipeline.__init__( - self, - vae, - text_encoder, - tokenizer, - unet, - scheduler, - safety_checker, - feature_extractor, - image_encoder, - requires_safety_checker, - ) - - self.to(self._device) - - def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (num_images, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != num_images: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective number" - f" of images of {num_images}. Make sure the number of images matches the length of the generators." - ) - - if latents is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(num_images) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @classmethod - def _split_inputs_into_batches(cls, batch_size, latents, prompt_embeds, negative_prompt_embeds): - # Use torch.split to generate num_batches batches of size batch_size - latents_batches = list(torch.split(latents, batch_size)) - prompt_embeds_batches = list(torch.split(prompt_embeds, batch_size)) - if negative_prompt_embeds is not None: - negative_prompt_embeds_batches = list(torch.split(negative_prompt_embeds, batch_size)) - - # If the last batch has less samples than batch_size, pad it with dummy samples - num_dummy_samples = 0 - if latents_batches[-1].shape[0] < batch_size: - num_dummy_samples = batch_size - latents_batches[-1].shape[0] - # Pad latents_batches - sequence_to_stack = (latents_batches[-1],) + tuple( - torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - latents_batches[-1] = torch.vstack(sequence_to_stack) - # Pad prompt_embeds_batches - sequence_to_stack = (prompt_embeds_batches[-1],) + tuple( - torch.zeros_like(prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) - # Pad negative_prompt_embeds_batches if necessary - if negative_prompt_embeds is not None: - sequence_to_stack = (negative_prompt_embeds_batches[-1],) + tuple( - 
torch.zeros_like(negative_prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - negative_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) - - # Stack batches in the same tensor - latents_batches = torch.stack(latents_batches) - if negative_prompt_embeds is not None: - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - for i, (negative_prompt_embeds_batch, prompt_embeds_batch) in enumerate( - zip(negative_prompt_embeds_batches, prompt_embeds_batches[:]) - ): - prompt_embeds_batches[i] = torch.cat([negative_prompt_embeds_batch, prompt_embeds_batch]) - prompt_embeds_batches = torch.stack(prompt_embeds_batches) - - return latents_batches, prompt_embeds_batches, num_dummy_samples - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - batch_size: int = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated images. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated images. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - batch_size (`int`, *optional*, defaults to 1): - The number of images in a batch. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that, if specified, is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when - using zero terminal SNR. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that is called at the end of each denoising step during inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - profiling_warmup_steps (`int`, *optional*): - Number of steps to ignore for profiling.
- profiling_steps (`int`, *optional*): - Number of steps to be captured when enabling profiling. - - Returns: - [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - # to deal with lora scaling and other possible forward hooks - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - num_prompts = 1 - elif prompt is not None and isinstance(prompt, list): - num_prompts = len(prompt) - else: - num_prompts = prompt_embeds.shape[0] - num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) - logger.info( - f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt," - f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." - ) - if num_batches < 3: - logger.warning("The first two iterations are slower so it is recommended to feed more batches.") - device = self._execution_device - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - if ip_adapter_image is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt - ) - - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - num_prompts * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 6.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( - batch_size * num_images_per_prompt - ) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 7. Split into batches (HPU-specific step) - latents_batches, text_embeddings_batches, num_dummy_samples = self._split_inputs_into_batches( - batch_size, - latents, - prompt_embeds, - negative_prompt_embeds, - ) - - outputs = { - "images": [], - "has_nsfw_concept": [], - } - t0 = time.time() - t1 = t0 - - self._num_timesteps = len(timesteps) - - hb_profiler = HabanaProfile( - warmup=profiling_warmup_steps, - active=profiling_steps, - record_shapes=False, - ) - hb_profiler.start() - - # 8. Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps - ) - - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - if use_warmup_inference_steps: - t0_inf = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - text_embeddings_batch = text_embeddings_batches[0] - text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) - - for i in range(len(timesteps)): - if use_warmup_inference_steps and i == throughput_warmup_steps: - t1_inf = time.time() - t1 += t1_inf - t0_inf - - if self.interrupt: - continue - timestep = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if self.do_classifier_free_guidance else latents_batch - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) - - # predict the noise residual - noise_pred = self.unet_hpu( - latent_model_input, - timestep, - text_embeddings_batch, - timestep_cond, - self.cross_attention_kwargs, - added_cond_kwargs, - ) - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg( - noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale - ) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step( - noise_pred, timestep, latents_batch, **extra_step_kwargs, return_dict=False - )[0] - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, timestep, callback_kwargs) - - latents_batch = callback_outputs.pop("latents", latents_batch) - text_embeddings_batch = callback_outputs.pop("prompt_embeds", text_embeddings_batch) - # negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, timestep, latents_batch) - - hb_profiler.step() - - if use_warmup_inference_steps: - t1 = warmup_inference_steps_time_adjustment( - t1, t1_inf, num_inference_steps, throughput_warmup_steps - ) - - if not output_type == "latent": - # 8. Post-processing - image = self.vae.decode( - latents_batch / self.vae.config.scaling_factor, return_dict=False, generator=generator - )[0] - else: - image = latents_batch - outputs["images"].append(image) - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - hb_profiler.stop() - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 or use_warmup_inference_steps - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] - - # Process generated images - for i, image in enumerate(outputs["images"][:]): - if i == 0: - outputs["images"].clear() - - if output_type == "latent": - has_nsfw_concept = None - else: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if output_type == "pil" and isinstance(image, list): - outputs["images"] += image - elif output_type in ["np", "numpy"] and isinstance(image, np.ndarray): - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] = np.concatenate((outputs["images"], image), axis=0) - else: - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] = torch.cat((outputs["images"], image), 0) - - if has_nsfw_concept is not None: - outputs["has_nsfw_concept"] += has_nsfw_concept - else: - outputs["has_nsfw_concept"] = None - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (outputs["images"], outputs["has_nsfw_concept"]) - - return GaudiStableDiffusionPipelineOutput( - images=outputs["images"], - nsfw_content_detected=outputs["has_nsfw_concept"], - throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) - - @torch.no_grad() - def 
unet_hpu( - self, - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - added_cond_kwargs, - ): - if self.use_hpu_graphs: - return self.capture_replay(latent_model_input, timestep, encoder_hidden_states) - else: - return self.unet( - latent_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - timestep_cond=timestep_cond, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - - @torch.no_grad() - def capture_replay(self, latent_model_input, timestep, encoder_hidden_states): - inputs = [latent_model_input, timestep, encoder_hidden_states, False] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - outputs = self.unet(inputs[0], inputs[1], inputs[2], inputs[3])[0] - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py deleted file mode 100644 index b60b6d89fc..0000000000 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ /dev/null @@ -1,513 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
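The `capture_replay` helper above is an instance of a general HPU-graph memoization pattern: hash the inputs, capture the module once per input signature, then replay the cached graph with fresh data copied in. A standalone sketch of that pattern, reusing only the `habana_frameworks` calls that appear in the code above (`module` and the wrapped callable are placeholders; a Gaudi device and `habana_frameworks` are assumed):

    import torch
    import habana_frameworks.torch as ht


    class HPUGraphRunner:
        """Capture a module once per input signature, then replay the cached HPU graph."""

        def __init__(self, module):
            self.module = module
            self.hpu_stream = ht.hpu.Stream()
            self.cache = {}

        @torch.no_grad()
        def __call__(self, *args):
            inputs = list(args)
            key = ht.hpu.graphs.input_hash(inputs)
            cached = self.cache.get(key)
            if cached is None:
                # First call for this signature: run the module while capturing the graph.
                with ht.hpu.stream(self.hpu_stream):
                    graph = ht.hpu.HPUGraph()
                    graph.capture_begin()
                    outputs = self.module(*inputs)
                    graph.capture_end()
                self.cache[key] = ht.hpu.graphs.CachedParams(inputs, outputs, graph)
                return outputs
            # Later calls: copy the new tensors into the captured input buffers and replay.
            ht.hpu.graphs.copy_to(cached.graph_inputs, inputs)
            cached.graph.replay()
            ht.core.hpu.default_stream().synchronize()
            return cached.graph_outputs

This is essentially what `unet_hpu` and `capture_replay` above do, specialized to the UNet's positional arguments.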
- -import time -from dataclasses import dataclass -from math import ceil -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import PIL -import torch -from diffusers.image_processor import PipelineImageInput -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines import StableDiffusionLDM3DPipeline -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from optimum.utils import logging - -from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import speed_metrics, warmup_inference_steps_time_adjustment -from ..pipeline_utils import GaudiDiffusionPipeline -from .pipeline_stable_diffusion import GaudiStableDiffusionPipeline - - -logger = logging.get_logger(__name__) - - -@dataclass -class GaudiStableDiffusionLDM3DPipelineOutput(BaseOutput): - rgb: Union[List[PIL.Image.Image], np.ndarray] - depth: Union[List[PIL.Image.Image], np.ndarray] - throughput: float - nsfw_content_detected: Optional[List[bool]] - - -class GaudiStableDiffusionLDM3DPipeline(GaudiDiffusionPipeline, StableDiffusionLDM3DPipeline): - """ - Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py#L84 - - Generation is performed by batches - - Two `mark_step()` were added to add support for lazy mode - - Added support for HPU graphs - - Adjusted original Stable Diffusion to match with the LDM3D implementation (input and output being different) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - text_encoder ([`~transformers.CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer ([`~transformers.CLIPTokenizer`]): - A `CLIPTokenizer` to tokenize text. - unet ([`UNet2DConditionModel`]): - A `UNet2DConditionModel` to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details - about a model's potential harms. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. - This will be faster and save memory compared to fp32/mixed precision but can harm generated images. 
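Since the Gaudi-specific constructor arguments described above (`use_habana`, `use_hpu_graphs`, `gaudi_config`, `bf16_full_eval`) are what distinguish this class from the upstream LDM3D pipeline, a short usage sketch may help. It is illustrative only: the model id and the `gaudi_config` Hub id are assumptions rather than values fixed by this file, and the output fields come from the `GaudiStableDiffusionLDM3DPipelineOutput` dataclass above.

```py
# Hedged usage sketch; "Intel/ldm3d-4c" and "Habana/stable-diffusion-2" are example Hub ids.
from optimum.habana.diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import (
    GaudiStableDiffusionLDM3DPipeline,
)

pipeline = GaudiStableDiffusionLDM3DPipeline.from_pretrained(
    "Intel/ldm3d-4c",
    use_habana=True,       # run on Gaudi instead of CPU
    use_hpu_graphs=True,   # enable the capture/replay path shown in capture_replay()
    gaudi_config="Habana/stable-diffusion-2",
)
out = pipeline(prompt="a photo of a cozy cabin in the woods", num_images_per_prompt=4, batch_size=2)
rgb_images, depth_maps = out.rgb, out.depth  # fields of GaudiStableDiffusionLDM3DPipelineOutput
print(out.throughput)                        # samples per second reported by speed_metrics
```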
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - image_encoder: Optional[CLIPVisionModelWithProjection], - requires_safety_checker: bool = True, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__( - self, - use_habana, - use_hpu_graphs, - gaudi_config, - bf16_full_eval, - ) - - # Workaround for Synapse 1.11 for full bf16 - if bf16_full_eval: - unet.conv_in.float() - - StableDiffusionLDM3DPipeline.__init__( - self, - vae, - text_encoder, - tokenizer, - unet, - scheduler, - safety_checker, - feature_extractor, - image_encoder, - requires_safety_checker, - ) - - self.to(self._device) - - def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (num_images, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != num_images: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective number" - f" of images of {num_images}. Make sure the number of images matches the length of the generators." - ) - - if latents is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(num_images) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - batch_size: int = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: Optional[int] = None, - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. 
- width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 5.0): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): - Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- - Returns: - [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - num_prompts = 1 - elif prompt is not None and isinstance(prompt, list): - num_prompts = len(prompt) - else: - num_prompts = prompt_embeds.shape[0] - num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) - logger.info( - f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt," - f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." - ) - if num_batches < 3: - logger.warning("The first two iterations are slower so it is recommended to feed more batches.") - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - if ip_adapter_image is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt - ) - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clip_skip=clip_skip, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device="cpu") - timesteps = self.scheduler.timesteps.to(device) - self.scheduler.reset_timestep_dependent_params() - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - num_prompts * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 7. Split into batches (HPU-specific step) - ( - latents_batches, - text_embeddings_batches, - num_dummy_samples, - ) = GaudiStableDiffusionPipeline._split_inputs_into_batches( - batch_size, - latents, - prompt_embeds, - negative_prompt_embeds, - ) - - outputs = { - "images": [], - "depths": [], - "has_nsfw_concept": [], - } - t0 = time.time() - t1 = t0 - - # 8. 
Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps - ) - - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - if use_warmup_inference_steps: - t0_inf = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - text_embeddings_batch = text_embeddings_batches[0] - text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) - - for i in range(len(timesteps)): - if use_warmup_inference_steps and i == throughput_warmup_steps: - t1_inf = time.time() - t1 += t1_inf - t0_inf - - timestep = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if do_classifier_free_guidance else latents_batch - ) - # latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) - - # predict the noise residual - noise_pred = self.unet_hpu( - latent_model_input, - timestep, - text_embeddings_batch, - cross_attention_kwargs, - added_cond_kwargs, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step( - noise_pred, timestep, latents_batch, **extra_step_kwargs, return_dict=False - )[0] - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, timestep, latents_batch) - - if use_warmup_inference_steps: - t1 = warmup_inference_steps_time_adjustment( - t1, t1_inf, num_inference_steps, throughput_warmup_steps - ) - - if not output_type == "latent": - # 8. 
Post-processing - image = self.vae.decode(latents_batch / self.vae.config.scaling_factor, return_dict=False)[0] - else: - image = latents_batch - outputs["images"].append(image) - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 or use_warmup_inference_steps - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] - - # Process generated images - for i, image in enumerate(outputs["images"][:]): - if i == 0: - outputs["images"].clear() - - if output_type == "latent": - has_nsfw_concept = None - else: - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - rgb, depth = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize - ) - - if output_type == "pil": - outputs["images"] += rgb - outputs["depths"] += depth - else: - outputs["images"] += [*rgb] - outputs["depths"] += [*depth] - - if has_nsfw_concept is not None: - outputs["has_nsfw_concept"] += has_nsfw_concept - else: - outputs["has_nsfw_concept"] = None - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return ((rgb, depth), has_nsfw_concept) - - return GaudiStableDiffusionLDM3DPipelineOutput( - rgb=outputs["images"], - depth=outputs["depths"], - nsfw_content_detected=has_nsfw_concept, - throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) - - @torch.no_grad() - def unet_hpu(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs, added_cond_kwargs): - if self.use_hpu_graphs: - return self.capture_replay(latent_model_input, timestep, encoder_hidden_states) - else: - return self.unet( - latent_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - - @torch.no_grad() - def capture_replay(self, latent_model_input, timestep, encoder_hidden_states): - inputs = [latent_model_input, timestep, encoder_hidden_states, False] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - outputs = self.unet(inputs[0], inputs[1], inputs[2], inputs[3])[0] - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py deleted file mode 100644 index 477871eb40..0000000000 --- 
a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ /dev/null @@ -1,643 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -from dataclasses import dataclass -from math import ceil -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import PIL -import torch -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines import StableDiffusionUpscalePipeline -from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput -from diffusers.utils.torch_utils import randn_tensor -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from optimum.utils import logging - -from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import speed_metrics, warmup_inference_steps_time_adjustment -from ..pipeline_utils import GaudiDiffusionPipeline - - -logger = logging.get_logger(__name__) - -PipelineImageInput = Union[ - PIL.Image.Image, np.ndarray, torch.FloatTensor, List[PIL.Image.Image], List[np.ndarray], List[torch.FloatTensor] -] - - -@dataclass -class GaudiStableDiffusionPipelineOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - throughput: float - - -class GaudiStableDiffusionUpscalePipeline(GaudiDiffusionPipeline, StableDiffusionUpscalePipeline): - """ - Pipeline for text-guided image super-resolution using Stable Diffusion 2. - - Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py#L70 - - Generation is performed by batches - - Two `mark_step()` were added to add support for lazy mode - - Added support for HPU graphs - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - low_res_scheduler ([`SchedulerMixin`]): - A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of - [`DDPMScheduler`]. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
- safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. - This will be faster and save memory compared to fp32/mixed precision but can harm generated images. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - low_res_scheduler: DDPMScheduler, - scheduler: KarrasDiffusionSchedulers, - safety_checker: Optional[Any] = None, - feature_extractor: Optional[CLIPImageProcessor] = None, - watermarker: Optional[Any] = None, - max_noise_level: int = 350, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__(self, use_habana, use_hpu_graphs, gaudi_config, bf16_full_eval) - - # Workaround for Synapse 1.11 for full bf16 - if bf16_full_eval: - unet.conv_in.float() - - StableDiffusionUpscalePipeline.__init__( - self, - vae, - text_encoder, - tokenizer, - unet, - low_res_scheduler, - scheduler, - safety_checker, - feature_extractor, - watermarker, - max_noise_level, - ) - - self.to(self._device) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height, width) - if latents is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(batch_size) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @classmethod - def _split_inputs_into_batches(cls, batch_size, latents, text_embeddings, uncond_embeddings, image, noise_level): - # Use torch.split to generate num_batches batches of size batch_size - latents_batches = list(torch.split(latents, batch_size)) - text_embeddings_batches = list(torch.split(text_embeddings, batch_size)) - image_batches = list(torch.split(image, batch_size)) - noise_level_batches = list(torch.split(noise_level.view(-1, 1), batch_size)) - if uncond_embeddings is not None: - uncond_embeddings_batches = list(torch.split(uncond_embeddings, batch_size)) - - # If the last batch has less samples than batch_size, pad it 
with dummy samples - num_dummy_samples = 0 - if latents_batches[-1].shape[0] < batch_size: - num_dummy_samples = batch_size - latents_batches[-1].shape[0] - # Pad latents_batches - sequence_to_stack = (latents_batches[-1],) + tuple( - torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - latents_batches[-1] = torch.vstack(sequence_to_stack) - # Pad image_batches - sequence_to_stack = (image_batches[-1],) + tuple( - torch.zeros_like(image_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - image_batches[-1] = torch.vstack(sequence_to_stack) - # Pad noise_level_batches - sequence_to_stack = (noise_level_batches[-1],) + tuple( - torch.zeros_like(noise_level_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - noise_level_batches[-1] = torch.vstack(sequence_to_stack) - # Pad text_embeddings_batches - sequence_to_stack = (text_embeddings_batches[-1],) + tuple( - torch.zeros_like(text_embeddings_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - text_embeddings_batches[-1] = torch.vstack(sequence_to_stack) - # Pad uncond_embeddings_batches if necessary - if uncond_embeddings is not None: - sequence_to_stack = (uncond_embeddings_batches[-1],) + tuple( - torch.zeros_like(uncond_embeddings_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - uncond_embeddings_batches[-1] = torch.vstack(sequence_to_stack) - - # Stack batches in the same tensor - latents_batches = torch.stack(latents_batches) - if uncond_embeddings is not None: - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - for i, (uncond_embeddings_batch, text_embeddings_batch) in enumerate( - zip(uncond_embeddings_batches, text_embeddings_batches[:]) - ): - text_embeddings_batches[i] = torch.cat([uncond_embeddings_batch, text_embeddings_batch]) - text_embeddings_batches = torch.stack(text_embeddings_batches) - image_batches = torch.stack(image_batches) - noise_level_batches = torch.stack(noise_level_batches).squeeze(-1) - - return latents_batches, text_embeddings_batches, image_batches, noise_level_batches, num_dummy_samples - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: PipelineImageInput = None, - num_inference_steps: int = 75, - guidance_scale: float = 9.0, - noise_level: int = 20, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - batch_size: int = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: int = None, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): - `Image` or tensor representing an image batch to be upscaled. 
- num_inference_steps (`int`, *optional*, defaults to 75): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 9.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - batch_size (`int`, *optional*, defaults to 1): - The number of images in a batch. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated randomly. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
- clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - - Returns: - [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] or `tuple`: - [`~diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - - Examples: - ```py - >>> import requests - >>> from PIL import Image - >>> from io import BytesIO - >>> from optimum.habana.diffusers import GaudiStableDiffusionUpscalePipeline - >>> import torch - - >>> # load model and scheduler - >>> model_id = "stabilityai/stable-diffusion-x4-upscaler" - >>> pipeline = GaudiStableDiffusionUpscalePipeline.from_pretrained( - ... model_id, - ... revision="fp16", - ... torch_dtype=torch.bfloat16, - ... use_habana=True, - ... use_hpu_graphs=True, - ... gaudi_config="Habana/stable-diffusion-2", - ... ) - - >>> # let's download an image - >>> url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" - >>> response = requests.get(url) - >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB") - >>> low_res_img = low_res_img.resize((128, 128)) - >>> prompt = "a white cat" - - >>> upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0] - >>> upscaled_image.save("upsampled_cat.png") - ``` - """ - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 0. Check inputs. Raise error if not correct - self.check_inputs( - prompt, image, noise_level, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - if image is None: - raise ValueError("`image` input cannot be undefined.") - - # 1. Define call parameters - if prompt is not None and isinstance(prompt, str): - num_prompts = 1 - elif prompt is not None and isinstance(prompt, list): - num_prompts = len(prompt) - else: - num_prompts = prompt_embeds.shape[0] - num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) - logger.info( - f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt," - f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." - ) - if num_batches < 3: - logger.warning("The first two iterations are slower so it is recommended to feed more batches.") - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=clip_skip, - ) - - # 3.
Preprocess image - image = self.image_processor.preprocess(image) - image = image.to(dtype=prompt_embeds.dtype, device=device) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device="cpu") - timesteps = self.scheduler.timesteps.to(device) - self.scheduler.reset_timestep_dependent_params() - - # 5. Add noise to image - noise_level = torch.tensor([noise_level], dtype=torch.long, device=device) - noise = randn_tensor(image.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) - image = self.low_res_scheduler.add_noise(image, noise, noise_level) - - image = torch.cat([image] * num_images_per_prompt) - noise_level = torch.cat([noise_level] * image.shape[0]) - - # 6. Prepare latent variables - height, width = image.shape[2:] - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - num_prompts * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 7. Check that sizes of image and latents match - num_channels_image = image.shape[1] - if num_channels_latents + num_channels_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_image`: {num_channels_image} " - f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input." - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Split into batches (HPU-specific step) - ( - latents_batches, - text_embeddings_batches, - image_batches, - noise_level_batches, - num_dummy_samples, - ) = self._split_inputs_into_batches( - batch_size, latents, prompt_embeds, negative_prompt_embeds, image, noise_level - ) - - outputs = {"images": [], "has_nsfw_concept": []} - t0 = time.time() - t1 = t0 - - # 10. 
Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps - ) - - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - if use_warmup_inference_steps: - t0_inf = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - text_embeddings_batch = text_embeddings_batches[0] - text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) - image_batch = image_batches[0] - image_batches = torch.roll(image_batches, shifts=-1, dims=0) - noise_level_batch = noise_level_batches[0] - noise_level_batches = torch.roll(noise_level_batches, shifts=-1, dims=0) - - for i in range(len(timesteps)): - if use_warmup_inference_steps and i == throughput_warmup_steps: - t1_inf = time.time() - t1 += t1_inf - t0_inf - - timestep = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if do_classifier_free_guidance else latents_batch - ) - # latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) #TODO why this has been removed? - image_input = torch.cat([image_batch] * 2) if do_classifier_free_guidance else image_batch - noise_level_input = ( - torch.cat([noise_level_batch] * 2) if do_classifier_free_guidance else noise_level_batch - ) - latent_model_input = torch.cat([latent_model_input, image_input], dim=1) - - # predict the noise residual - noise_pred = self.unet_hpu( - latent_model_input, - timestep, - text_embeddings_batch, - cross_attention_kwargs, - class_labels=noise_level_input, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step( - noise_pred, timestep, latents_batch, **extra_step_kwargs, return_dict=False - )[0] - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, timestep, latents_batch) - - if use_warmup_inference_steps: - t1 = warmup_inference_steps_time_adjustment( - t1, t1_inf, num_inference_steps, throughput_warmup_steps - ) - - if not output_type == "latent": - # 8. 
Post-processing - # make sure the VAE is in float32 mode, as it overflows in bfloat16 - needs_upcasting = self.vae.dtype == torch.bfloat16 and self.vae.config.force_upcast - - if needs_upcasting: - self.upcast_vae() - - # Ensure latents are always the same type as the VAE - latents_batch = latents_batch.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) - image = self.vae.decode(latents_batch / self.vae.config.scaling_factor, return_dict=False)[0] - - # cast back to fp16 if needed - if needs_upcasting: - self.vae.to(dtype=torch.bfloat16) - - image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) - - else: - image = latents_batch - outputs["images"].append(image) - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 or use_warmup_inference_steps - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] - - # Process generated images - for i, image in enumerate(outputs["images"][:]): - if i == 0: - outputs["images"].clear() - - if output_type == "latent": - has_nsfw_concept = None - else: - image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if output_type == "pil" and isinstance(image, list): - # Apply watermark - if self.watermarker is not None: - image = self.watermarker.apply_watermark(image) - outputs["images"] += image - elif output_type in ["np", "numpy"] and isinstance(image, np.ndarray): - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] = np.concatenate((outputs["images"], image), axis=0) - else: - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] = torch.cat((outputs["images"], image), 0) - - if has_nsfw_concept is not None: - outputs["has_nsfw_concept"] += has_nsfw_concept - else: - outputs["has_nsfw_concept"] = None - - if not return_dict: - return (outputs["images"], outputs["has_nsfw_concept"]) - - return GaudiStableDiffusionPipelineOutput( - images=outputs["images"], - nsfw_content_detected=outputs["has_nsfw_concept"], - throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) - - @torch.no_grad() - def unet_hpu(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs, class_labels): - if self.use_hpu_graphs: - return self.capture_replay(latent_model_input, timestep, encoder_hidden_states, class_labels) - else: - return self.unet( - latent_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - class_labels=class_labels, - )[0] - - @torch.no_grad() - def capture_replay(self, latent_model_input, timestep, encoder_hidden_states, class_labels): - inputs = [latent_model_input, timestep, encoder_hidden_states, False, class_labels] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is 
None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - outputs = self.unet( - inputs[0], - timestep=inputs[1], - encoder_hidden_states=inputs[2], - return_dict=inputs[3], - class_labels=inputs[4], - )[0] - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py deleted file mode 100644 index 589cb18b05..0000000000 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ /dev/null @@ -1,1563 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import time -from dataclasses import dataclass -from math import ceil -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import habana_frameworks.torch.core as htcore -import numpy as np -import PIL -import torch -from diffusers import StableDiffusionXLPipeline -from diffusers.image_processor import PipelineImageInput -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import ( - StableDiffusionXLPipelineOutput, - retrieve_timesteps, -) -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import BaseOutput, deprecate, replace_example_docstring -from transformers import ( - CLIPImageProcessor, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) - -from optimum.habana.diffusers import GaudiDiffusionPipeline -from optimum.habana.diffusers.models.attention_processor import ( - AttentionProcessor, - AttnProcessor2_0, - ScaledDotProductAttention, -) -from optimum.habana.diffusers.models.unet_2d_condition import gaudi_unet_2d_condition_model_forward -from optimum.habana.diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( - retrieve_timesteps as retrieve_timesteps_v2, -) -from optimum.habana.transformers.gaudi_configuration import GaudiConfig -from optimum.habana.utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment -from optimum.utils import logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import torch - >>> from diffusers import StableDiffusionXLPipeline - - >>> pipe = StableDiffusionXLPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ... 
) - >>> pipe = pipe.to("cuda") - - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - ``` -""" -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 - """ - std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) - std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - -# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor -def set_attn_processor_hpu(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Sets the attention processor to use to compute attention. - Parameters: - processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - for **all** `Attention` layers. - - If `processor` is a dict, the key needs to define the path to the corresponding cross attention - processor. This is strongly recommended when setting trainable attention processors. - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): - if hasattr(module, "set_processor"): - if os.environ.get('PATCH_SDPA') is not None: - setattr(module, 'attention_module', ScaledDotProductAttention()) - module.set_processor(processor(module.attention_module)) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - -# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor -def set_default_attn_processor_hpu(self): - """ - Disables custom attention processors and sets the default attention implementation from HPU. - """ - processor = AttnProcessor2_0 - set_attn_processor_hpu(self, processor) - - -class StableDiffusionXLPipeline_HPU(StableDiffusionXLPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion XL. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - In addition the pipeline inherits the following loading methods: - - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`] - - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] - - as well as the following saving methods: - - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion XL uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - text_encoder_2 ([` CLIPTextModelWithProjection`]): - Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), - specifically the - [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) - variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - tokenizer_2 (`CLIPTokenizer`): - Second Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): - Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of - `stabilityai/stable-diffusion-xl-base-1-0`. - add_watermarker (`bool`, *optional*): - Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to - watermark output images. If not defined, it will default to True if the package is installed, otherwise no - watermarker will be used. 
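Because `StableDiffusionXLPipeline_HPU` patches the UNet's forward and attention processors rather than inheriting `GaudiDiffusionPipeline`, device placement is left to the caller. The following is an untested sketch of how it might be driven; the module path is taken from this file's location, the prompt is the one from `EXAMPLE_DOC_STRING` above, and running it assumes a Gaudi device with `habana_frameworks.torch` installed.

```py
# Hedged sketch, not a documented entry point.
import torch
import habana_frameworks.torch.core as htcore  # imported for its HPU side effects, as this file does
from optimum.habana.diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
    StableDiffusionXLPipeline_HPU,
)

pipe = StableDiffusionXLPipeline_HPU.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
)
pipe.to("hpu")  # assumes an available Gaudi device
image = pipe(prompt="a photo of an astronaut riding a horse on mars", num_inference_steps=30).images[0]
image.save("astronaut_rides_horse.png")
```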
- """ - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - text_encoder_2: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - tokenizer_2: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - image_encoder: CLIPVisionModelWithProjection = None, - feature_extractor: CLIPImageProcessor = None, - force_zeros_for_empty_prompt: bool = True, - add_watermarker: Optional[bool] = None, - ): - super().__init__(vae,text_encoder,text_encoder_2,tokenizer, tokenizer_2,unet,scheduler,image_encoder,feature_extractor,force_zeros_for_empty_prompt,add_watermarker) - self.unet.set_default_attn_processor= set_default_attn_processor_hpu - self.unet.forward = gaudi_unet_2d_condition_model_forward - - def run_unet(self, unet, latents, timesteps, t, i, add_text_embeds, add_time_ids, prompt_embeds, extra_step_kwargs, - negative_prompt_embeds, negative_add_time_ids, negative_pooled_prompt_embeds, - num_warmup_steps, progress_bar, callback,callback_steps, ip_adapter_image, - image_embeds, timestep_cond, callback_on_step_end, callback_on_step_end_tensor_inputs): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: - added_cond_kwargs["image_embeds"] = image_embeds - noise_pred = unet(unet, - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - cross_attention_kwargs=self.cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - htcore.mark_step() - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) - negative_pooled_prompt_embeds = callback_outputs.pop( - "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds - ) - add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) - negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - return latents - - @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - denoising_end: Optional[float] = None, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is - used in both text-encoders - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise as determined by the discrete timesteps selected by the - scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a - "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) - guidance_scale (`float`, *optional*, defaults to 5.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. 
- generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). 
Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - # 0. 
Default height and width to unet - height = height or self.default_sample_size * self.vae_scale_factor - width = width or self.default_sample_size * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._denoising_end = denoising_end - self._interrupt = False - image_embeds = None - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - device=device, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=self.do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - if self.text_encoder_2 is None: - text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) - else: - text_encoder_projection_dim = self.text_encoder_2.config.projection_dim - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - else: - negative_add_time_ids = add_time_ids - - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) - add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) - - prompt_embeds = prompt_embeds.to(device) - add_text_embeds = add_text_embeds.to(device) - add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - - if ip_adapter_image is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt - ) - - # 8. Denoising loop - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - - # 8.1 Apply denoising_end - if ( - self.denoising_end is not None - and isinstance(self.denoising_end, float) - and self.denoising_end > 0 - and self.denoising_end < 1 - ): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (self.denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - # 9. 
Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - timesteps = [t.item() for t in timesteps] - if self.quantized: - for i, t in enumerate(timesteps[0:-2]): - if self.interrupt: - continue - latents = self.run_unet(self.unet, latents, timesteps, - t ,i , - add_text_embeds ,add_time_ids ,prompt_embeds, extra_step_kwargs, - negative_prompt_embeds, negative_add_time_ids, negative_pooled_prompt_embeds, - num_warmup_steps, progress_bar, callback, - callback_steps, ip_adapter_image, - image_embeds, timestep_cond, callback_on_step_end, - callback_on_step_end_tensor_inputs) - for i, t in enumerate(timesteps[-2:], 18): - if self.interrupt: - continue - latents = self.run_unet(self.unet_bf16, latents, timesteps, - t ,i , - add_text_embeds ,add_time_ids ,prompt_embeds, extra_step_kwargs, - negative_prompt_embeds, negative_add_time_ids, negative_pooled_prompt_embeds, - num_warmup_steps, progress_bar, callback, - callback_steps, ip_adapter_image, - image_embeds, timestep_cond, callback_on_step_end, - callback_on_step_end_tensor_inputs) - else: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - latents = self.run_unet(self.unet, latents, timesteps, - t ,i , - add_text_embeds ,add_time_ids ,prompt_embeds, extra_step_kwargs, - negative_prompt_embeds, negative_add_time_ids, negative_pooled_prompt_embeds, - num_warmup_steps, progress_bar, callback, - callback_steps, ip_adapter_image, - image_embeds, timestep_cond, callback_on_step_end, - callback_on_step_end_tensor_inputs) - if not output_type == "latent": - # make sure the VAE is in float32 mode, as it overflows in float16 - needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast - - if needs_upcasting: - self.upcast_vae() - latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) - - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - - # cast back to fp16 if needed - if needs_upcasting: - self.vae.to(dtype=torch.float16) - else: - image = latents - - if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - - image = self.image_processor.postprocess(image, output_type=output_type) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) - - -@dataclass -class GaudiStableDiffusionXLPipelineOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - throughput: float - - -class GaudiStableDiffusionXLPipeline(GaudiDiffusionPipeline, StableDiffusionXLPipeline): - """ - Pipeline for text-to-image generation using Stable Diffusion XL on Gaudi devices - Adapted from: https://github.com/huggingface/diffusers/blob/v0.23.1/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L96 - - Extends the [`StableDiffusionXLPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline) class: - - Generation is performed by batches - - Two 
`mark_step()` calls were added to support lazy mode - - Added support for HPU graphs - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion XL uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - text_encoder_2 ([`CLIPTextModelWithProjection`]): - Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), - specifically the - [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) - variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - tokenizer_2 (`CLIPTokenizer`): - Second tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`): - Whether the negative prompt embeddings should always be forced to 0. Also see the config of - `stabilityai/stable-diffusion-xl-base-1.0`. - use_habana (bool, defaults to `False`): - Whether to use Gaudi (`True`) or CPU (`False`). - use_hpu_graphs (bool, defaults to `False`): - Whether to use HPU graphs or not. - gaudi_config (Union[str, [`GaudiConfig`]], defaults to `None`): - Gaudi configuration to use. Can be a string to download it from the Hub. - Or a previously initialized config can be passed. - bf16_full_eval (bool, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. - This will be faster and save memory compared to fp32/mixed precision but can harm generated images.
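A minimal usage sketch for these Gaudi-specific arguments, assuming the pipeline is exposed as `optimum.habana.diffusers.GaudiStableDiffusionXLPipeline` and that the `Habana/stable-diffusion` Gaudi configuration from the Hub suits the target device; the prompt and batching values are arbitrary.

```python
import torch
from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline

pipeline = GaudiStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    use_habana=True,                          # run on HPU instead of CPU
    use_hpu_graphs=True,                      # capture/replay HPU graphs for the UNet
    gaudi_config="Habana/stable-diffusion",   # Gaudi configuration downloaded from the Hub
    torch_dtype=torch.bfloat16,
)

outputs = pipeline(
    prompt="An astronaut riding a green horse",
    num_images_per_prompt=4,
    batch_size=2,   # images are generated batch by batch, padded if needed
)
print(len(outputs.images), outputs.throughput)
```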
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - text_encoder_2: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - tokenizer_2: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - image_encoder: CLIPVisionModelWithProjection = None, - feature_extractor: CLIPImageProcessor = None, - force_zeros_for_empty_prompt: bool = True, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__( - self, - use_habana, - use_hpu_graphs, - gaudi_config, - bf16_full_eval, - ) - - StableDiffusionXLPipeline.__init__( - self, - vae, - text_encoder, - text_encoder_2, - tokenizer, - tokenizer_2, - unet, - scheduler, - image_encoder, - feature_extractor, - force_zeros_for_empty_prompt, - ) - - self.to(self._device) - - def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (num_images, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != num_images: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {num_images}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(num_images) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @classmethod - def _split_inputs_into_batches( - cls, - batch_size, - latents, - prompt_embeds, - negative_prompt_embeds, - add_text_embeds, - negative_pooled_prompt_embeds, - add_time_ids, - negative_add_time_ids, - ): - # Use torch.split to generate num_batches batches of size batch_size - latents_batches = list(torch.split(latents, batch_size)) - prompt_embeds_batches = list(torch.split(prompt_embeds, batch_size)) - if negative_prompt_embeds is not None: - negative_prompt_embeds_batches = list(torch.split(negative_prompt_embeds, batch_size)) - if add_text_embeds is not None: - add_text_embeds_batches = list(torch.split(add_text_embeds, batch_size)) - if negative_pooled_prompt_embeds is not None: - negative_pooled_prompt_embeds_batches = list(torch.split(negative_pooled_prompt_embeds, batch_size)) - if add_time_ids is not None: - add_time_ids_batches = list(torch.split(add_time_ids, batch_size)) - if negative_add_time_ids is not None: - negative_add_time_ids_batches = list(torch.split(negative_add_time_ids, batch_size)) - - # If the last batch has less samples than batch_size, pad it with dummy samples - num_dummy_samples = 0 - if latents_batches[-1].shape[0] < batch_size: - num_dummy_samples = batch_size - latents_batches[-1].shape[0] - # Pad latents_batches - sequence_to_stack = (latents_batches[-1],) + tuple( - 
torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - latents_batches[-1] = torch.vstack(sequence_to_stack) - # Pad prompt_embeds_batches - sequence_to_stack = (prompt_embeds_batches[-1],) + tuple( - torch.zeros_like(prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) - # Pad negative_prompt_embeds_batches if necessary - if negative_prompt_embeds is not None: - sequence_to_stack = (negative_prompt_embeds_batches[-1],) + tuple( - torch.zeros_like(negative_prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - negative_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) - # Pad add_text_embeds_batches if necessary - if add_text_embeds is not None: - sequence_to_stack = (add_text_embeds_batches[-1],) + tuple( - torch.zeros_like(add_text_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - add_text_embeds_batches[-1] = torch.vstack(sequence_to_stack) - # Pad negative_pooled_prompt_embeds_batches if necessary - if negative_pooled_prompt_embeds is not None: - sequence_to_stack = (negative_pooled_prompt_embeds_batches[-1],) + tuple( - torch.zeros_like(negative_pooled_prompt_embeds_batches[-1][0][None, :]) - for _ in range(num_dummy_samples) - ) - negative_pooled_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) - # Pad add_time_ids_batches if necessary - if add_time_ids is not None: - sequence_to_stack = (add_time_ids_batches[-1],) + tuple( - torch.zeros_like(add_time_ids_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - add_time_ids_batches[-1] = torch.vstack(sequence_to_stack) - # Pad negative_add_time_ids_batches if necessary - if negative_add_time_ids is not None: - sequence_to_stack = (negative_add_time_ids_batches[-1],) + tuple( - torch.zeros_like(negative_add_time_ids_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - negative_add_time_ids_batches[-1] = torch.vstack(sequence_to_stack) - - # Stack batches in the same tensor - latents_batches = torch.stack(latents_batches) - - if negative_prompt_embeds is not None: - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - for i, (negative_prompt_embeds_batch, prompt_embeds_batch) in enumerate( - zip(negative_prompt_embeds_batches, prompt_embeds_batches[:]) - ): - prompt_embeds_batches[i] = torch.cat([negative_prompt_embeds_batch, prompt_embeds_batch]) - prompt_embeds_batches = torch.stack(prompt_embeds_batches) - - if add_text_embeds is not None: - if negative_pooled_prompt_embeds is not None: - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - for i, (negative_pooled_prompt_embeds_batch, add_text_embeds_batch) in enumerate( - zip(negative_pooled_prompt_embeds_batches, add_text_embeds_batches[:]) - ): - add_text_embeds_batches[i] = torch.cat( - [negative_pooled_prompt_embeds_batch, add_text_embeds_batch] - ) - add_text_embeds_batches = torch.stack(add_text_embeds_batches) - else: - add_text_embeds_batches = None - - if add_time_ids is not None: - if negative_add_time_ids is not None: - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - for i, (negative_add_time_ids_batch, add_time_ids_batch) in enumerate( - zip(negative_add_time_ids_batches, add_time_ids_batches[:]) - ): - add_time_ids_batches[i] = torch.cat([negative_add_time_ids_batch, add_time_ids_batch]) - add_time_ids_batches = torch.stack(add_time_ids_batches) - else: - add_time_ids_batches = None - - return latents_batches, prompt_embeds_batches, add_text_embeds_batches, add_time_ids_batches, num_dummy_samples - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - denoising_end: Optional[float] = None, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - batch_size: int = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - negative_original_size: Optional[Tuple[int, int]] = None, - negative_crops_coords_top_left: Tuple[int, int] = (0, 0), - negative_target_size: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = [ - "latents", - "prompt_embeds", - "negative_prompt_embeds", - "add_text_embeds", - "add_time_ids", - "negative_pooled_prompt_embeds", - "negative_add_time_ids", - ], - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in both text-encoders - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. 
This is set to 1024 by default for the best results. - Anything below 512 pixels won't work well for - [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) - and checkpoints that are not specifically fine-tuned on low resolutions. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - denoising_end (`float`, *optional*): - When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be - completed before it is intentionally prematurely terminated. As a result, the returned sample will - still retain a substantial amount of noise as determined by the discrete timesteps selected by the - scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a - "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image - Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) - guidance_scale (`float`, *optional*, defaults to 5.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - batch_size (`int`, *optional*, defaults to 1): - The number of images in a batch. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - #Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - Whether or not to return a [`~diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.GaudiStableDiffusionXLPipelineOutput`] instead - of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - For most cases, `target_size` should be set to the desired height and width of the generated image. If - not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in - section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
- negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a specific image resolution. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's - micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - To negatively condition the generation process based on a target image resolution. It should be as same - as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more - information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - profiling_warmup_steps (`int`, *optional*): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*): - Number of steps to be captured when enabling profiling. - - Examples: - - Returns: - #[`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: - #[`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a - [`~diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.GaudiStableDiffusionXLPipelineOutput`] or `tuple`: - [`~diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.GaudiStableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", - ) - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 0. 
Default height and width to unet - height = height or self.default_sample_size * self.vae_scale_factor - width = width or self.default_sample_size * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - prompt_2, - height, - width, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._denoising_end = denoising_end - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - num_prompts = 1 - elif prompt is not None and isinstance(prompt, list): - num_prompts = len(prompt) - else: - num_prompts = prompt_embeds.shape[0] - num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) - logger.info( - f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt," - f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." - ) - if num_batches < 3: - logger.warning("The first two iterations are slower so it is recommended to feed more batches.") - - device = self._execution_device - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self.encode_prompt( - prompt=prompt, - prompt_2=prompt_2, - device=device, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=self.do_classifier_free_guidance, - negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps_v2(self.scheduler, num_inference_steps, device, timesteps) - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - num_prompts * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - if self.text_encoder_2 is None: - text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) - else: - text_encoder_projection_dim = self.text_encoder_2.config.projection_dim - - add_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - if negative_original_size is not None and negative_target_size is not None: - negative_add_time_ids = self._get_add_time_ids( - negative_original_size, - negative_crops_coords_top_left, - negative_target_size, - dtype=prompt_embeds.dtype, - text_encoder_projection_dim=text_encoder_projection_dim, - ) - else: - negative_add_time_ids = add_time_ids - - prompt_embeds = prompt_embeds.to(device) - if negative_prompt_embeds is not None: - negative_prompt_embeds = negative_prompt_embeds.to(device) - add_text_embeds = add_text_embeds.to(device) - if negative_pooled_prompt_embeds is not None: - negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(device) - add_time_ids = add_time_ids.to(device).repeat(num_prompts * num_images_per_prompt, 1) - negative_add_time_ids = negative_add_time_ids.to(device).repeat(num_prompts * num_images_per_prompt, 1) - - if ip_adapter_image is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, device, batch_size * num_images_per_prompt - ) - - # 7.5 Split into batches (HPU-specific step) - ( - latents_batches, - text_embeddings_batches, - add_text_embeddings_batches, - add_time_ids_batches, - num_dummy_samples, - ) = self._split_inputs_into_batches( - batch_size, - latents, - prompt_embeds, - negative_prompt_embeds, - add_text_embeds, - negative_pooled_prompt_embeds, - add_time_ids, - negative_add_time_ids, - ) - outputs = { - "images": [], - } - t0 = time.time() - t1 = t0 - - self._num_timesteps = len(timesteps) - - hb_profiler = HabanaProfile( - warmup=profiling_warmup_steps, - active=profiling_steps, - record_shapes=False, - ) - hb_profiler.start() - - # 8. 
Denoising - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) - - # 8.1 Apply denoising_end - if ( - self.denoising_end is not None - and isinstance(self.denoising_end, float) - and self.denoising_end > 0 - and self.denoising_end < 1 - ): - discrete_timestep_cutoff = int( - round( - self.scheduler.config.num_train_timesteps - - (self.denoising_end * self.scheduler.config.num_train_timesteps) - ) - ) - num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) - timesteps = timesteps[:num_inference_steps] - - # 8.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( - num_prompts * num_images_per_prompt - ) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - self._num_timesteps = len(timesteps) - - # 8.3 Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps - ) - - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - if use_warmup_inference_steps: - t0_inf = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - text_embeddings_batch = text_embeddings_batches[0] - text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) - add_text_embeddings_batch = add_text_embeddings_batches[0] - add_text_embeddings_batches = torch.roll(add_text_embeddings_batches, shifts=-1, dims=0) - add_time_ids_batch = add_time_ids_batches[0] - add_time_ids_batches = torch.roll(add_time_ids_batches, shifts=-1, dims=0) - - for i in range(num_inference_steps): - if use_warmup_inference_steps and i == throughput_warmup_steps: - t1_inf = time.time() - t1 += t1_inf - t0_inf - - if self.interrupt: - continue - timestep = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if self.do_classifier_free_guidance else latents_batch - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) - - # predict the noise residual - added_cond_kwargs = {"text_embeds": add_text_embeddings_batch, "time_ids": add_time_ids_batch} - if ip_adapter_image is not None: - added_cond_kwargs["image_embeds"] = image_embeds - noise_pred = self.unet_hpu( - latent_model_input, - timestep, - text_embeddings_batch, - timestep_cond, - self.cross_attention_kwargs, - added_cond_kwargs, - ) - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg( - noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale - ) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step( - noise_pred, timestep, latents_batch, **extra_step_kwargs, return_dict=False - )[0] - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, timestep, callback_kwargs) - - latents_batch = callback_outputs.pop("latents", latents_batch) - _prompt_embeds = callback_outputs.pop("prompt_embeds", None) - _negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", None) - if _prompt_embeds is not None and _negative_prompt_embeds is not None: - text_embeddings_batch = torch.cat([_negative_prompt_embeds, _prompt_embeds]) - _add_text_embeds = callback_outputs.pop("add_text_embeds", None) - _negative_pooled_prompt_embeds = callback_outputs.pop("negative_pooled_prompt_embeds", None) - if _add_text_embeds is not None and _negative_pooled_prompt_embeds is not None: - add_text_embeddings_batch = torch.cat([_negative_pooled_prompt_embeds, _add_text_embeds]) - _add_time_ids = callback_outputs.pop("add_time_ids", None) - _negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", None) - if _add_time_ids is not None and _negative_add_time_ids is not None: - add_time_ids_batch = torch.cat([_add_time_ids, _negative_add_time_ids]) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, timestep, latents) - - hb_profiler.step() - - if use_warmup_inference_steps: - t1 = warmup_inference_steps_time_adjustment( - t1, t1_inf, num_inference_steps, throughput_warmup_steps - ) - - if not output_type == "latent": - # Post-processing - # To resolve the dtype mismatch issue - image = self.vae.decode( - (latents_batch / self.vae.config.scaling_factor).to(self.vae.encoder.conv_in.weight.dtype), - return_dict=False, - )[0] - - else: - image = latents_batch - - outputs["images"].append(image) - - if not self.use_hpu_graphs: - self.htcore.mark_step() - - hb_profiler.stop() - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 or use_warmup_inference_steps - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] - - # Process generated images - for i, image in enumerate(outputs["images"][:]): - if i == 0: - outputs["images"].clear() - - if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - - image = self.image_processor.postprocess(image, output_type=output_type) - - if output_type == "pil" and isinstance(image, list): - outputs["images"] += image - elif output_type in ["np", "numpy"] and isinstance(image, np.ndarray): - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] 
= np.concatenate((outputs["images"], image), axis=0) - else: - if len(outputs["images"]) == 0: - outputs["images"] = image - else: - outputs["images"] = torch.cat((outputs["images"], image), 0) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return outputs["images"] - - return GaudiStableDiffusionXLPipelineOutput( - images=outputs["images"], - throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) - - @torch.no_grad() - def unet_hpu( - self, - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - added_cond_kwargs, - ): - if self.use_hpu_graphs: - return self.capture_replay( - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - added_cond_kwargs, - ) - else: - return self.unet( - latent_model_input, - timestep, - encoder_hidden_states=encoder_hidden_states, - timestep_cond=timestep_cond, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, - )[0] - - @torch.no_grad() - def capture_replay( - self, - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - added_cond_kwargs, - ): - inputs = [ - latent_model_input, - timestep, - encoder_hidden_states, - timestep_cond, - cross_attention_kwargs, - added_cond_kwargs, - ] - h = self.ht.hpu.graphs.input_hash(inputs) - cached = self.cache.get(h) - - if cached is None: - # Capture the graph and cache it - with self.ht.hpu.stream(self.hpu_stream): - graph = self.ht.hpu.HPUGraph() - graph.capture_begin() - - outputs = self.unet( - sample=inputs[0], - timestep=inputs[1], - encoder_hidden_states=inputs[2], - timestep_cond=inputs[3], - cross_attention_kwargs=inputs[4], - added_cond_kwargs=inputs[5], - return_dict=False, - )[0] - - graph.capture_end() - graph_inputs = inputs - graph_outputs = outputs - self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) - return outputs - - # Replay the cached graph with updated inputs - self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) - cached.graph.replay() - self.ht.core.hpu.default_stream().synchronize() - - return cached.graph_outputs diff --git a/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py deleted file mode 100644 index cee268140e..0000000000 --- a/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ /dev/null @@ -1,759 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -import time -from dataclasses import dataclass -from math import ceil -from typing import Callable, Dict, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel -from diffusers.pipelines.stable_video_diffusion import StableVideoDiffusionPipeline -from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import ( - _append_dims, - _resize_with_antialiasing, - tensor2vid, -) -from diffusers.schedulers import EulerDiscreteScheduler -from diffusers.utils import BaseOutput, logging -from diffusers.utils.torch_utils import is_compiled_module, randn_tensor -from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection - -from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import speed_metrics -from ..pipeline_utils import GaudiDiffusionPipeline - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class GaudiStableVideoDiffusionPipelineOutput(BaseOutput): - r""" - Output class for zero-shot text-to-video pipeline. - - Args: - frames (`[List[PIL.Image.Image]`, `np.ndarray`]): - List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, - num_channels)`. - throughput (float): - Measured samples per second - """ - - frames: Union[List[PIL.Image.Image], np.ndarray] - throughput: float - - -class GaudiStableVideoDiffusionPipeline(GaudiDiffusionPipeline, StableVideoDiffusionPipeline): - r""" - Adapted from: https://github.com/huggingface/diffusers/blob/v0.24.0/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py#L72 - - Added generation by batches functionality - - Added support for HPU graphs - - Pipeline to generate video from an input image using Stable Video Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. - image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). - unet ([`UNetSpatioTemporalConditionModel`]): - A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. - scheduler ([`EulerDiscreteScheduler`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. - feature_extractor ([`~transformers.CLIPImageProcessor`]): - A `CLIPImageProcessor` to extract features from generated images. 
- """ - - def __init__( - self, - vae: AutoencoderKLTemporalDecoder, - image_encoder: CLIPVisionModelWithProjection, - unet: UNetSpatioTemporalConditionModel, - scheduler: EulerDiscreteScheduler, - feature_extractor: CLIPImageProcessor, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - GaudiDiffusionPipeline.__init__( - self, - use_habana, - use_hpu_graphs, - gaudi_config, - bf16_full_eval, - ) - - StableVideoDiffusionPipeline.__init__( - self, - vae, - image_encoder, - unet, - scheduler, - feature_extractor, - ) - - self.to(self._device) - - def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.image_processor.pil_to_numpy(image) - image = self.image_processor.numpy_to_pt(image) - - # We normalize the image before resizing to match with the original implementation. - # Then we unnormalize it after resizing. - image = image * 2.0 - 1.0 - image = _resize_with_antialiasing(image, (224, 224)) - image = (image + 1.0) / 2.0 - - # Normalize the image with for CLIP input - image = self.feature_extractor( - images=image, - do_normalize=True, - do_center_crop=False, - do_resize=False, - do_rescale=False, - return_tensors="pt", - ).pixel_values - - image = image.to(device=device, dtype=dtype) - image_embeddings = self.image_encoder(image).image_embeds - image_embeddings = image_embeddings.unsqueeze(1) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) - image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) - - if do_classifier_free_guidance: - negative_image_embeddings = torch.zeros_like(image_embeddings) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) - - return image_embeddings - - def _encode_vae_image( - self, - image: torch.Tensor, - device, - num_videos_per_prompt, - do_classifier_free_guidance, - ): - image = image.to(device=device) - image_latents = self.vae.encode(image).latent_dist.mode() - - if do_classifier_free_guidance: - negative_image_latents = torch.zeros_like(image_latents) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_latents = torch.cat([negative_image_latents, image_latents]) - - # duplicate image_latents for each generation per prompt, using mps friendly method - image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) - - return image_latents - - def _get_add_time_ids( - self, - fps, - motion_bucket_id, - noise_aug_strength, - dtype, - batch_size, - num_videos_per_prompt, - do_classifier_free_guidance, - device, - ): - add_time_ids = [fps, motion_bucket_id, noise_aug_strength] - - passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids) - expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features - - if expected_add_embed_dim != passed_add_embed_dim: - raise ValueError( - f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." - ) - - add_time_ids = torch.tensor([add_time_ids], dtype=dtype).to(device) - add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) - - if do_classifier_free_guidance: - add_time_ids = torch.cat([add_time_ids, add_time_ids]) - - return add_time_ids - - def decode_latents(self, latents, num_frames, decode_chunk_size=14): - # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] - latents = latents.flatten(0, 1) - - latents = 1 / self.vae.config.scaling_factor * latents - - forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward - accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys()) - - # decode decode_chunk_size frames at a time to avoid OOM - frames = [] - for i in range(0, latents.shape[0], decode_chunk_size): - num_frames_in = latents[i : i + decode_chunk_size].shape[0] - decode_kwargs = {} - if accepts_num_frames: - # we only pass num_frames_in if it's expected - decode_kwargs["num_frames"] = num_frames_in - - frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample - frames.append(frame) - frames = torch.cat(frames, dim=0) - - # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] - frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - frames = frames.float() - return frames - - def check_inputs(self, image, height, width): - if ( - not isinstance(image, torch.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - def prepare_latents( - self, - batch_size, - num_frames, - num_channels_latents, - height, - width, - dtype, - device, - generator, - latents=None, - ): - shape = ( - batch_size, - num_frames, - num_channels_latents // 2, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length 
{len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype).to(device) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @property - def guidance_scale(self): - return self._guidance_scale - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - if isinstance(self.guidance_scale, (int, float)): - return self.guidance_scale - return self.guidance_scale.max() > 1 - - @property - def num_timesteps(self): - return self._num_timesteps - - @classmethod - def _pad_batches(cls, input_batches, num_dummy_samples): - sequence_to_stack = (input_batches[-1],) + tuple( - torch.zeros_like(input_batches[-1][0][None, :]) for _ in range(num_dummy_samples) - ) - input_batches[-1] = torch.vstack(sequence_to_stack) - return input_batches - - @classmethod - def _split_input_into_batches( - cls, - cond_input, - batch_size, - num_dummy_samples, - uncond_input=None, - ): - input_batches = list(torch.split(cond_input, batch_size)) - uncond_input_batches = None - if uncond_input is not None: - uncond_input_batches = list(torch.split(uncond_input, batch_size)) - - if num_dummy_samples > 0: # Pad inputs - input_batches = cls._pad_batches(input_batches, num_dummy_samples) - if uncond_input_batches is not None: - uncond_input_batches = cls._pad_batches(uncond_input_batches, num_dummy_samples) - - if uncond_input_batches is not None: - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and conditional inputs into a single batch - # to avoid doing two forward passes - for i, (uncond_input_batch, input_batch) in enumerate(zip(uncond_input_batches, input_batches[:])): - input_batches[i] = torch.cat([uncond_input_batch, input_batch]) - input_batches = torch.stack(input_batches) - return input_batches - - @classmethod - def _split_image_latents_into_batches( - cls, - image_latents, - batch_size, - num_dummy_samples, - num_images, - do_classifier_free_guidance, - ): - if do_classifier_free_guidance: - # Tiling of unconditional and conditional image latents differs from image embeddings - # For image latents, first concatenate the unconditional and conditional image latents - # Next, repeat for number of videos per prompt - negative_image_latents = torch.zeros_like(image_latents) - image_latents_batches = list(torch.split(image_latents, batch_size)) - negative_image_latents_batches = list(torch.split(negative_image_latents, batch_size)) - if num_dummy_samples > 0: # Pad inputs - image_latents_batches = cls._pad_batches(image_latents_batches, num_dummy_samples) - negative_image_latents_batches = cls._pad_batches(negative_image_latents_batches, num_dummy_samples) - for i, (negative_image_latents_batch, image_latents_batch) in enumerate( - zip(negative_image_latents_batches, image_latents_batches[:]) - ): - uncond_splits = list(torch.split(negative_image_latents_batch, num_images)) - cond_splits = list(torch.split(image_latents_batch, num_images)) - input_batch = [torch.cat([uncond, cond]) for (uncond, cond) in zip(uncond_splits, cond_splits)] - image_latents_batches[i] = torch.vstack(input_batch) - image_latents_batches = torch.stack(image_latents_batches) - else: - image_latents_batches = cls._split_input_into_batches(image_latents, batch_size, num_dummy_samples) - - return image_latents_batches - - @classmethod - def _split_inputs_into_batches( - cls, - batch_size, - latents, - image_latents, - image_embeddings, - added_time_ids, - num_images, - do_classifier_free_guidance, - ): - if do_classifier_free_guidance: - negative_image_embeddings, image_embeddings = image_embeddings.chunk(2) - negative_added_time_ids, added_time_ids = added_time_ids.chunk(2) - else: - negative_image_embeddings = None - negative_added_time_ids = None - - # If the last batch has less samples than batch_size, compute number of dummy samples to pad - last_samples = latents.shape[0] % batch_size - num_dummy_samples = batch_size - last_samples if last_samples > 0 else 0 - - # Generate num_batches batches of size batch_size - latents_batches = cls._split_input_into_batches(latents, batch_size, num_dummy_samples) - image_latents_batches = cls._split_image_latents_into_batches( - image_latents, batch_size, num_dummy_samples, num_images, do_classifier_free_guidance - ) - image_embeddings_batches = cls._split_input_into_batches( - image_embeddings, batch_size, num_dummy_samples, negative_image_embeddings - ) - added_time_ids_batches = cls._split_input_into_batches( - added_time_ids, batch_size, num_dummy_samples, negative_added_time_ids - ) - - return ( - latents_batches, - image_latents_batches, - image_embeddings_batches, - added_time_ids_batches, - num_dummy_samples, - ) - - @torch.no_grad() - def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], - height: int = 576, - width: int = 1024, - num_frames: Optional[int] = None, - batch_size: int = 1, - num_inference_steps: int = 25, - min_guidance_scale: float = 1.0, - 
max_guidance_scale: float = 3.0, - fps: int = 7, - motion_bucket_id: int = 127, - noise_aug_strength: float = 0.02, - decode_chunk_size: Optional[int] = None, - num_videos_per_prompt: Optional[int] = 1, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - return_dict: bool = True, - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image or images to guide image generation. If you provide a tensor, it needs to be compatible with - [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt` - batch_size (`int`, *optional*, defaults to 1): - The number of images in a batch. - num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter is modulated by `strength`. - min_guidance_scale (`float`, *optional*, defaults to 1.0): - The minimum guidance scale. Used for the classifier free guidance with first frame. - max_guidance_scale (`float`, *optional*, defaults to 3.0): - The maximum guidance scale. Used for the classifier free guidance with last frame. - fps (`int`, *optional*, defaults to 7): - Frames per second. The rate at which the generated images shall be exported to a video after generation. - Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. - motion_bucket_id (`int`, *optional*, defaults to 127): - The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video. - noise_aug_strength (`float`, *optional*, defaults to 0.02): - The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. - decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency - between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once - for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. - num_videos_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - - Returns: - [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list of list with the generated frames. - - Examples: - - ```py - from diffusers import StableVideoDiffusionPipeline - from diffusers.utils import load_image, export_to_video - - pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") - pipe.to("cuda") - - image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") - image = image.resize((1024, 576)) - - frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] - export_to_video(frames, "generated.mp4", fps=7) - ``` - """ - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - num_frames = num_frames if num_frames is not None else self.unet.config.num_frames - decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames - - # 1. Check inputs. Raise error if not correct - self.check_inputs(image, height, width) - - # 2. Define call parameters - if isinstance(image, PIL.Image.Image): - num_images = 1 - elif isinstance(image, list): - num_images = len(image) - else: - num_images = image.shape[0] - num_batches = ceil((num_videos_per_prompt * num_images) / batch_size) - logger.info( - f"{num_images} image(s) received, {num_videos_per_prompt} video(s) per prompt," - f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." - ) - if num_batches < 3: - logger.warning("The first two iterations are slower so it is recommended to feed more batches.") - - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - self._guidance_scale = max_guidance_scale - - # 3. 
Encode input image - image_embeddings = self._encode_image( - image, device, num_videos_per_prompt, self.do_classifier_free_guidance - ) - - # NOTE: Stable Diffusion Video was conditioned on fps - 1, which - # is why it is reduced here. - # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188 - fps = fps - 1 - - # 4. Encode input image using VAE - image = self.image_processor.preprocess(image, height=height, width=width) - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if device.type == "hpu" else device - noise = randn_tensor(image.shape, generator=generator, device=rand_device, dtype=image.dtype).to(device) - # image = self.image_processor.preprocess(image, height=height, width=width).to(device) - # noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype) - - image = image + noise_aug_strength * noise - - needs_upcasting = ( - self.vae.dtype == torch.float16 or self.vae.dtype == torch.bfloat16 - ) and self.vae.config.force_upcast - - if needs_upcasting: - cast_dtype = self.vae.dtype - self.vae.to(dtype=torch.float32) - - # Only encode the conditional image latents and generate unconditional image latents during batch split - # The tiling of conditional and unconditional image latents requires special handling - image_latents = self._encode_vae_image( - image, - device=device, - num_videos_per_prompt=num_videos_per_prompt, - do_classifier_free_guidance=False, # Override to return only conditional latents - ) - image_latents = image_latents.to(image_embeddings.dtype) - - # cast back to fp16/bf16 if needed - if needs_upcasting: - self.vae.to(dtype=cast_dtype) - - # Repeat the image latents for each frame so we can concatenate them with the noise - # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] - image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1) - - # 5. Get Added Time IDs - added_time_ids = self._get_add_time_ids( - fps, - motion_bucket_id, - noise_aug_strength, - image_embeddings.dtype, - num_images, - num_videos_per_prompt, - self.do_classifier_free_guidance, - device, - ) - added_time_ids = added_time_ids.to(device) - - # 6 Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - self.scheduler.reset_timestep_dependent_params() - - # 7 Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - num_images * num_videos_per_prompt, - num_frames, - num_channels_latents, - height, - width, - image_embeddings.dtype, - device, - generator, - latents, - ) - - # 8. Prepare guidance scale - guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0) - guidance_scale = guidance_scale.to(device, latents.dtype) - guidance_scale = guidance_scale.repeat(batch_size, 1) - guidance_scale = _append_dims(guidance_scale, latents.ndim) - - self._guidance_scale = guidance_scale - - # 9. Split into batches (HPU-specific step) - ( - latents_batches, - image_latents_batches, - image_embeddings_batches, - added_time_ids_batches, - num_dummy_samples, - ) = self._split_inputs_into_batches( - batch_size, - latents, - image_latents, - image_embeddings, - added_time_ids, - num_images, - self.do_classifier_free_guidance, - ) - - outputs = { - "frames": [], - } - t0 = time.time() - t1 = t0 - - # 10. 
Denoising loop - throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) - self._num_timesteps = len(timesteps) - for j in self.progress_bar(range(num_batches)): - # The throughput is calculated from the 3rd iteration - # because compilation occurs in the first two iterations - if j == throughput_warmup_steps: - t1 = time.time() - - latents_batch = latents_batches[0] - latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) - image_latents_batch = image_latents_batches[0] - image_latents_batches = torch.roll(image_latents_batches, shifts=-1, dims=0) - image_embeddings_batch = image_embeddings_batches[0] - image_embeddings_batches = torch.roll(image_embeddings_batches, shifts=-1, dims=0) - added_time_ids_batch = added_time_ids_batches[0] - added_time_ids_batches = torch.roll(added_time_ids_batches, shifts=-1, dims=0) - - for i in self.progress_bar(range(num_inference_steps)): - timestep = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents_batch] * 2) if self.do_classifier_free_guidance else latents_batch - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) - - # Concatenate image_latents over channels dimention - latent_model_input = torch.cat([latent_model_input, image_latents_batch], dim=2) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - timestep, - encoder_hidden_states=image_embeddings_batch, - added_time_ids=added_time_ids_batch, - return_dict=False, - )[0] - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents_batch = self.scheduler.step(noise_pred, timestep, latents_batch).prev_sample - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, timestep, callback_kwargs) - - latents_batch = callback_outputs.pop("latents", latents_batch) - - if not output_type == "latent": - # cast back to fp16/bf16 if needed - if needs_upcasting: - self.vae.to(dtype=cast_dtype) - - frames = self.decode_latents(latents_batch, num_frames, decode_chunk_size) - frames = tensor2vid(frames, self.image_processor, output_type=output_type) - else: - frames = latents_batch - - outputs["frames"].append(frames) - - speed_metrics_prefix = "generation" - speed_measures = speed_metrics( - split=speed_metrics_prefix, - start_time=t0, - num_samples=num_batches * batch_size - if t1 == t0 - else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, - start_time_after_warmup=t1, - ) - logger.info(f"Speed metrics: {speed_measures}") - - # Remove dummy generations if needed - if num_dummy_samples > 0: - outputs["frames"][-1] = outputs["frames"][-1][:-num_dummy_samples] - - # Process generated images - for i, frames in enumerate(outputs["frames"][:]): - if i == 0: - outputs["frames"].clear() - - if output_type == "pil": - outputs["frames"] += frames - else: - outputs["frames"] += [*frames] - - self.maybe_free_model_hooks() - - if not return_dict: - return outputs["frames"] - - return GaudiStableVideoDiffusionPipelineOutput( - frames=outputs["frames"], - throughput=speed_measures[f"{speed_metrics_prefix}_samples_per_second"], - ) 
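
For completeness, a hedged usage sketch of the Gaudi video pipeline deleted above, adapted from the CUDA example in its docstring. It assumes the class is re-exported from `optimum.habana.diffusers` like the other Gaudi pipelines, and the `gaudi_config` name `"Habana/stable-diffusion"` is an assumption you may need to adjust:

```py
from diffusers.utils import export_to_video, load_image

from optimum.habana.diffusers import GaudiStableVideoDiffusionPipeline

pipe = GaudiStableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",  # assumed Gaudi configuration name
)

image = load_image(
    "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
)
image = image.resize((1024, 576))

# `batch_size` controls how many samples are denoised per HPU batch (see `__call__` above).
output = pipe(image, num_frames=25, decode_chunk_size=8, batch_size=1)
export_to_video(output.frames[0], "generated.mp4", fps=7)
print(f"Throughput: {output.throughput:.2f} samples/s")
```

The returned `GaudiStableVideoDiffusionPipelineOutput` carries both the generated frames and the measured throughput, with the first batches treated as warm-up as described in the denoising loop above.
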
diff --git a/optimum/habana/diffusers/schedulers/__init__.py b/optimum/habana/diffusers/schedulers/__init__.py deleted file mode 100644 index 37eb80b1a6..0000000000 --- a/optimum/habana/diffusers/schedulers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .scheduling_ddim import GaudiDDIMScheduler -from .scheduling_euler_ancestral_discrete import GaudiEulerAncestralDiscreteScheduler -from .scheduling_euler_discrete import GaudiEulerDiscreteScheduler diff --git a/optimum/habana/diffusers/schedulers/scheduling_ddim.py b/optimum/habana/diffusers/schedulers/scheduling_ddim.py deleted file mode 100644 index d15420853f..0000000000 --- a/optimum/habana/diffusers/schedulers/scheduling_ddim.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Stanford University Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.configuration_utils import register_to_config -from diffusers.schedulers import DDIMScheduler -from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput - - -class GaudiDDIMScheduler(DDIMScheduler): - """ - Extends [Diffusers' DDIMScheduler](https://huggingface.co/docs/diffusers/api/schedulers#diffusers.DDIMScheduler) to run optimally on Gaudi: - - All time-dependent parameters are generated at the beginning - - At each time step, tensors are rolled to update the values of the time-dependent parameters - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - clip_sample (`bool`, defaults to `True`): - Clip the predicted sample for numerical stability. - clip_sample_range (`float`, defaults to 1.0): - The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, defaults to `True`): - Each diffusion step uses the alphas product value at that step and at the previous one. For the final step - there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the alpha value at step 0. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable - Diffusion. 
- prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - thresholding (`bool`, defaults to `False`): - Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such - as Stable Diffusion. - dynamic_thresholding_ratio (`float`, defaults to 0.995): - The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. - sample_max_value (`float`, defaults to 1.0): - The threshold value for dynamic thresholding. Valid only when `thresholding=True`. - timestep_spacing (`str`, defaults to `"leading"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, - set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - timestep_spacing: str = "leading", - rescale_betas_zero_snr: bool = False, - ): - super().__init__( - num_train_timesteps, - beta_start, - beta_end, - beta_schedule, - trained_betas, - clip_sample, - set_alpha_to_one, - steps_offset, - prediction_type, - thresholding, - dynamic_thresholding_ratio, - clip_sample_range, - sample_max_value, - timestep_spacing, - rescale_betas_zero_snr, - ) - - self.reset_timestep_dependent_params() - - def reset_timestep_dependent_params(self): - self.are_timestep_dependent_params_set = False - self.alpha_prod_t_list = [] - self.alpha_prod_t_prev_list = [] - self.variance_list = [] - - def get_params(self, timestep: Optional[int] = None): - """ - Initialize the time-dependent parameters, and retrieve the time-dependent - parameters at each timestep. The tensors are rolled in a separate function - at the end of the scheduler step in case parameters are retrieved multiple - times in a timestep, e.g., when scaling model inputs and in the scheduler step. - - Args: - timestep (`int`, optional): - The current discrete timestep in the diffusion chain. Optionally used to - initialize parameters in cases which start in the middle of the - denoising schedule (e.g. for image-to-image). 
- """ - if not self.are_timestep_dependent_params_set: - prev_timesteps = self.timesteps - self.config.num_train_timesteps // self.num_inference_steps - for t, prev_t in zip(self.timesteps, prev_timesteps): - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.final_alpha_cumprod - - self.alpha_prod_t_list.append(alpha_prod_t) - self.alpha_prod_t_prev_list.append(alpha_prod_t_prev) - self.variance_list.append(self._get_variance(alpha_prod_t, alpha_prod_t_prev)) - - self.alpha_prod_t_list = torch.stack(self.alpha_prod_t_list) - self.alpha_prod_t_prev_list = torch.stack(self.alpha_prod_t_prev_list) - self.variance_list = torch.stack(self.variance_list) - self.are_timestep_dependent_params_set = True - - alpha_prod_t = self.alpha_prod_t_list[0] - alpha_prod_t_prev = self.alpha_prod_t_prev_list[0] - variance = self.variance_list[0] - - return alpha_prod_t, alpha_prod_t_prev, variance - - def roll_params(self): - """ - Roll tensors to update the values of the time-dependent parameters at each timestep. - """ - if self.are_timestep_dependent_params_set: - self.alpha_prod_t_list = torch.roll(self.alpha_prod_t_list, shifts=-1, dims=0) - self.alpha_prod_t_prev_list = torch.roll(self.alpha_prod_t_prev_list, shifts=-1, dims=0) - self.variance_list = torch.roll(self.variance_list, shifts=-1, dims=0) - else: - raise ValueError("Time-dependent parameters should be set first.") - return - - # def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: - # """ - # Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - # current timestep. - # Args: - # sample (`torch.FloatTensor`): input sample - # timestep (`int`, optional): current timestep - # Returns: - # `torch.FloatTensor`: scaled input sample - # """ - # return sample - - def _get_variance(self, alpha_prod_t, alpha_prod_t_prev): - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - - return variance - - def step( - self, - model_output: torch.FloatTensor, - timestep: int, - sample: torch.FloatTensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[torch.FloatTensor] = None, - return_dict: bool = True, - ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - eta (`float`): - The weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`, defaults to `False`): - If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary - because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no - clipping has happened, "corrected" `model_output` would coincide with the one provided as input and - `use_clipped_model_output` has no effect. - generator (`torch.Generator`, *optional*): - A random number generator. - variance_noise (`torch.FloatTensor`): - Alternative to generating noise with `generator` by directly providing the noise for the variance - itself. 
Useful for methods such as [`CycleDiffusion`]. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~diffusers.schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. - - Returns: - [`diffusers.schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~diffusers.schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. get previous step value (=t-1) - # Done in self.get_params() below - - # 2. compute alphas, betas - alpha_prod_t, alpha_prod_t_prev, variance = self.get_params(timestep) - beta_prod_t = 1 - alpha_prod_t - - # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - # 4. Clip or threshold "predicted x_0" - if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) - elif self.config.clip_sample: - pred_original_sample = pred_original_sample.clamp( - -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 5. compute variance: "sigma_t(η)" -> see formula (16) - # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - std_dev_t = eta * variance ** (0.5) - - if use_clipped_model_output: - # the pred_epsilon is always re-derived from the clipped x_0 in Glide - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - - # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon - - # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - - if eta > 0: - device = model_output.device - if variance_noise is not None and generator is not None: - raise ValueError( - "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" - " `variance_noise` stays `None`." 
- ) - - if variance_noise is None: - # torch.randn is broken on HPU so running it on CPU - variance_noise = torch.randn( - model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator - ) - if device.type == "hpu": - variance_noise = variance_noise.to(device) - - prev_sample = prev_sample + std_dev_t * variance_noise - - # Roll parameters for next timestep - self.roll_params() - - if not return_dict: - return (prev_sample,) - - return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.IntTensor, - ) -> torch.FloatTensor: - # Make sure alphas_cumprod has same device and dtype as original_samples - # Make sure alphas_cumprod and timestep have same device and dtype as original_samples - self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) - timesteps = timesteps.to(original_samples.device) - - sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples diff --git a/optimum/habana/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/optimum/habana/diffusers/schedulers/scheduling_euler_ancestral_discrete.py deleted file mode 100644 index d2c4792f19..0000000000 --- a/optimum/habana/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright 2023 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.configuration_utils import register_to_config -from diffusers.schedulers import EulerAncestralDiscreteScheduler -from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -class GaudiEulerAncestralDiscreteScheduler(EulerAncestralDiscreteScheduler): - """ - Extends [Diffusers' EulerAncestralDiscreteScheduler](https://huggingface.co/docs/diffusers/en/api/schedulers/euler_ancestral) to run optimally on Gaudi: - - All time-dependent parameters are generated at the beginning - - At each time step, tensors are rolled to update the values of the time-dependent parameters - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. 
- beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - timestep_spacing (`str`, defaults to `"linspace"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable - Diffusion. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - timestep_spacing: str = "linspace", - steps_offset: int = 0, - rescale_betas_zero_snr: bool = False, - ): - super().__init__( - num_train_timesteps, - beta_start, - beta_end, - beta_schedule, - trained_betas, - prediction_type, - timestep_spacing, - steps_offset, - ) - - self._initial_timestep = None - self.reset_timestep_dependent_params() - - def reset_timestep_dependent_params(self): - self.are_timestep_dependent_params_set = False - self.sigma_t_list = [] - self.sigma_up_t_list = [] - self.sigma_down_t_list = [] - - def get_params(self, timestep: Union[float, torch.FloatTensor]): - """ - Initialize the time-dependent parameters, and retrieve the time-dependent - parameters at each timestep. The tensors are rolled in a separate function - at the end of the scheduler step in case parameters are retrieved multiple - times in a timestep, e.g., when scaling model inputs and in the scheduler step. - - Args: - timestep (`float`): - The current discrete timestep in the diffusion chain. Optionally used to - initialize parameters in cases which start in the middle of the - denoising schedule (e.g. 
for image-to-image) - """ - if self.step_index is None: - self._init_step_index(timestep) - - if not self.are_timestep_dependent_params_set: - sigmas_from = self.sigmas[self.step_index : -1] - sigmas_to = self.sigmas[(self.step_index + 1) :] - - for sigma_from, sigma_to in zip(sigmas_from, sigmas_to): - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 - sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 - - self.sigma_t_list.append(sigma_from) - self.sigma_up_t_list.append(sigma_up) - self.sigma_down_t_list.append(sigma_down) - - self.sigma_t_list = torch.stack(self.sigma_t_list) - self.sigma_up_t_list = torch.stack(self.sigma_up_t_list) - self.sigma_down_t_list = torch.stack(self.sigma_down_t_list) - self.are_timestep_dependent_params_set = True - - sigma = self.sigma_t_list[0] - sigma_up = self.sigma_up_t_list[0] - sigma_down = self.sigma_down_t_list[0] - - return sigma, sigma_up, sigma_down - - def roll_params(self): - """ - Roll tensors to update the values of the time-dependent parameters at each timestep. - """ - if self.are_timestep_dependent_params_set: - self.sigma_t_list = torch.roll(self.sigma_t_list, shifts=-1, dims=0) - self.sigma_up_t_list = torch.roll(self.sigma_up_t_list, shifts=-1, dims=0) - self.sigma_down_t_list = torch.roll(self.sigma_down_t_list, shifts=-1, dims=0) - else: - raise ValueError("Time-dependent parameters should be set first.") - return - - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. - - Args: - sample (`torch.FloatTensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.FloatTensor`: - A scaled input sample. - """ - - sigma, _, _ = self.get_params(timestep) - sample = sample / ((sigma**2 + 1) ** 0.5) - self.is_scale_input_called = True - return sample - - def step( - self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, - generator: Optional[torch.Generator] = None, - return_dict: bool = True, - ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - generator (`torch.Generator`, *optional*): - A random number generator. - return_dict (`bool`): - Whether or not to return a - [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple. - - Returns: - [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`: - If return_dict is `True`, - [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned, - otherwise a tuple is returned where the first element is the sample tensor. 
- - """ - - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): - raise ValueError( - ( - "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" - " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" - " one of the `scheduler.timesteps` as a timestep." - ), - ) - - if not self.is_scale_input_called: - logger.warning( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - - sigma, sigma_up, sigma_down = self.get_params(timestep) - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - # 2. Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma - - dt = sigma_down - sigma - - prev_sample = sample + derivative * dt - - device = model_output.device - - # torch.randn is broken on HPU so running it on CPU - noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator) - if device.type == "hpu": - noise = noise.to(device) - - prev_sample = prev_sample + noise * sigma_up - - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - - # upon completion increase step index by one - self._step_index += 1 - self.roll_params() - - if not return_dict: - return (prev_sample,) - - return EulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample - ) diff --git a/optimum/habana/diffusers/schedulers/scheduling_euler_discrete.py b/optimum/habana/diffusers/schedulers/scheduling_euler_discrete.py deleted file mode 100644 index aef5649cd0..0000000000 --- a/optimum/habana/diffusers/schedulers/scheduling_euler_discrete.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright 2023 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers import EulerDiscreteScheduler -from diffusers.configuration_utils import register_to_config -from diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteSchedulerOutput -from diffusers.utils.torch_utils import randn_tensor - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - -class GaudiEulerDiscreteScheduler(EulerDiscreteScheduler): - """ - Extends [Diffusers' EulerDiscreteScheduler](https://huggingface.co/docs/diffusers/api/schedulers#diffusers.EulerDiscreteScheduler) to run optimally on Gaudi: - - All time-dependent parameters are generated at the beginning - - At each time step, tensors are rolled to update the values of the time-dependent parameters - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - interpolation_type(`str`, defaults to `"linear"`, *optional*): - The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of - `"linear"` or `"log_linear"`. - use_karras_sigmas (`bool`, *optional*, defaults to `False`): - Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, - the sigmas are determined according to a sequence of noise levels {σi}. - timestep_spacing (`str`, defaults to `"linspace"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable - Diffusion. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
- """ - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - interpolation_type: str = "linear", - use_karras_sigmas: Optional[bool] = False, - sigma_min: Optional[float] = None, - sigma_max: Optional[float] = None, - timestep_spacing: str = "linspace", - timestep_type: str = "discrete", # can be "discrete" or "continuous" - steps_offset: int = 0, - rescale_betas_zero_snr: bool = False, - ): - super().__init__( - num_train_timesteps, - beta_start, - beta_end, - beta_schedule, - trained_betas, - prediction_type, - interpolation_type, - use_karras_sigmas, - sigma_min, - sigma_max, - timestep_spacing, - timestep_type, - steps_offset, - rescale_betas_zero_snr, - ) - - self._initial_timestep = None - self.reset_timestep_dependent_params() - self.hpu_opt = False - - def reset_timestep_dependent_params(self): - self.are_timestep_dependent_params_set = False - self.sigma_list = [] - self.sigma_next_list = [] - - def get_params(self, timestep: Union[float, torch.FloatTensor]): - if self.step_index is None: - self._init_step_index(timestep) - - if not self.are_timestep_dependent_params_set: - sigmas = self.sigmas[self.step_index : -1] - sigmas_next = self.sigmas[(self.step_index + 1) :] - - for sigma, sigma_next in zip(sigmas, sigmas_next): - self.sigma_list.append(sigma) - self.sigma_next_list.append(sigma_next) - - self.sigma_list = torch.stack(self.sigma_list) - self.sigma_next_list = torch.stack(self.sigma_next_list) - self.are_timestep_dependent_params_set = True - - sigma = self.sigma_list[0] - sigma_next = self.sigma_next_list[0] - - return sigma, sigma_next - - def roll_params(self): - """ - Roll tensors to update the values of the time-dependent parameters at each timestep. - """ - if self.are_timestep_dependent_params_set: - self.sigma_list = torch.roll(self.sigma_list, shifts=-1, dims=0) - self.sigma_next_list = torch.roll(self.sigma_next_list, shifts=-1, dims=0) - else: - raise ValueError("Time-dependent parameters should be set first.") - return - - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. - - Args: - sample (`torch.FloatTensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.FloatTensor`: - A scaled input sample. - """ - - if self.hpu_opt: - if self.step_index is None: - self._init_step_index(timestep) - self.sigmas = self.sigmas.to(sample.dtype) - sigma = self.sigmas[self.step_index] - else: - sigma, _ = self.get_params(timestep) - sample = sample / ((sigma**2 + 1) ** 0.5) - self.is_scale_input_called = True - return sample - - def step( - self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, - s_churn: float = 0.0, - s_tmin: float = 0.0, - s_tmax: float = float("inf"), - s_noise: float = 1.0, - generator: Optional[torch.Generator] = None, - return_dict: bool = True, - ) -> Union[EulerDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. 
This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - s_churn (`float`): - s_tmin (`float`): - s_tmax (`float`): - s_noise (`float`, defaults to 1.0): - Scaling factor for noise added to the sample. - generator (`torch.Generator`, *optional*): - A random number generator. - return_dict (`bool`): - Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or - tuple. - - Returns: - [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is - returned, otherwise a tuple is returned where the first element is the sample tensor. - """ - - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): - raise ValueError( - ( - "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" - " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" - " one of the `scheduler.timesteps` as a timestep." - ), - ) - - if not self.is_scale_input_called: - logger.warning( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - - if self.hpu_opt and self.step_index is None: - self._init_step_index(timestep) - - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - - if self.hpu_opt: - sigma = self.sigmas[self.step_index] - else: - sigma, sigma_next = self.get_params(timestep) - - if self.hpu_opt and sigma.device.type == 'hpu': - gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) - else: - gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - - if self.hpu_opt: - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator - ) - else: - device = model_output.device - - # torch.randn is broken on HPU so running it on CPU - noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator) - if device.type == "hpu": - noise = noise.to(device) - - eps = noise * s_noise - sigma_hat = sigma * (gamma + 1) - - if gamma > 0: - sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - # NOTE: "original_sample" should not be an expected prediction_type but is left in for - # backwards compatibility - if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma_hat * model_output - elif self.config.prediction_type == "v_prediction": - # denoised = model_output * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - # 2. 
Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma_hat - - if self.hpu_opt: - dt = self.sigmas[self.step_index + 1] - sigma_hat - else: - dt = sigma_next - sigma_hat - - prev_sample = sample + derivative * dt - - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - - # upon completion increase step index by one - self._step_index += 1 - if not self.hpu_opt: - self.roll_params() - - if not return_dict: - return (prev_sample,) - - return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) diff --git a/optimum/habana/distributed/__init__.py b/optimum/habana/distributed/__init__.py deleted file mode 100644 index af269ee68c..0000000000 --- a/optimum/habana/distributed/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -import torch - -from .distributed_runner import DistributedRunner -from .fast_ddp import all_reduce_gradients - - -def rank_and_world(group=None): - """ - Returns (rank, world_size) from the optionally-specified group, otherwise - from the default group, or if non-distributed just returns (0, 1) - """ - if torch.distributed.is_initialized() and group is None: - group = torch.distributed.GroupMember.WORLD - - if group is None: - world_size = 1 - rank = 0 - else: - world_size = group.size() - rank = group.rank() - - return rank, world_size - - -_LOCAL_RANK = int(os.getenv("LOCAL_RANK", 0)) - - -def local_rank(): - return _LOCAL_RANK diff --git a/optimum/habana/distributed/distributed_runner.py b/optimum/habana/distributed/distributed_runner.py deleted file mode 100644 index 91911b826c..0000000000 --- a/optimum/habana/distributed/distributed_runner.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -############################################################################### -# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company -############################################################################### - -import os -import subprocess -import sys -from pathlib import Path -from typing import List, Union - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -class DistributedRunner: - """ - Set up training/inference hardware configurations and run distributed commands. - """ - - def __init__( - self, - command_list: List = [], - world_size: int = 1, - hostfile: Union[str, Path] = None, - use_mpi: bool = False, - use_deepspeed: bool = False, - master_port: int = 29500, - use_env: bool = False, - map_by: bool = "socket", - multi_hls=None, - ): - """ - The `DistributedRunner` enables to exectute a command in a distributed way: - - On one Gaudi server with MPI, DeepSpeed or `torch.distributed` - - On several nodes with DeepSpeed. - - Args: - command_list (List, optional): The list of commands to execute. Defaults to []. - world_size (int, optional): The number of devices to use. 
This is only used for single-node runs. Defaults to 1. - hostfile (Union[str, Path], optional): The path to the hostfile specifying the IP addresses and the number of devices to use for each node. This is only used for multi-node runs. Defaults to None. - use_mpi (bool, optional): Whether to use OpenMPI for the communication between devices. Defaults to False. - use_deepspeed (bool, optional): Wheter to use DeepSpeed. Defaults to False. - use_env (bool, optional): Whether to use `--use_env` with `torch.distributed`. Defaults to False. - map_by (bool, optional): The mapping unit used for assigning processes with MPI. Defaults to "socket". - """ - - logging.set_verbosity(logging.INFO) - logging.enable_default_handler() - logging.enable_explicit_format() - - self._commands = command_list - self._world_size = world_size - self._hostfile = hostfile - self._map_by = map_by - self._master_port = master_port - self._use_env = use_env - self._interpreter = f"{sys.executable} " - - self._model_env_vars = {} - - # TODO: remove multi_hls - if multi_hls is not None: - logger.warning("`multi_hls` is deprecated and will be removed in a future version.") - - if use_deepspeed and use_mpi: - raise ValueError("`use_mpi` and `use_deepspeed` cannot be both True.") - - if hostfile is not None: - if isinstance(self._hostfile, str): - self._hostfile = Path(self._hostfile) - # Multi-node run - if use_deepspeed: - self.create_multi_node_setup() - else: - raise ValueError( - "A hostfile is specified to perform a multi-node run. This requires to enable DeepSpeed with" - " `use_deepspeed=True`." - ) - elif self._world_size > 1: - # Distributed run - if use_deepspeed: - # Single-node multi-card run with DeepSpeed - self.create_single_node_setup_deepspeed() - elif use_mpi: - # Single-node multi-card run with MPI - self._model_env_vars["MASTER_ADDR"] = "localhost" - self._model_env_vars["MASTER_PORT"] = self._master_port - self.create_single_node_setup_mpirun() - else: - # Single-node multi-card run with torch.distributed - self.create_single_node_setup() - else: - # Single-card run - logger.warning( - "The run will be executed on one device only. Specify `world_size` > 1 or `hostfile` to perform a" - " distributed run." - ) - self.create_single_card_setup(use_deepspeed) - - def get_peval(self): - cmd1 = r"lscpu 2>/dev/null | awk '/Socket\(s\)/ { print $2 }'" - cmd2 = r"lscpu 2>/dev/null | awk '/Core\(s\) per socket/ { print $4 }'" - with subprocess.Popen( - cmd1, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) as proc: - lscpu_output1 = proc.stdout.read() - with subprocess.Popen( - cmd2, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) as proc: - lscpu_output2 = proc.stdout.read() - sockets = int(lscpu_output1) - corespsocket = int(lscpu_output2) - if corespsocket == 1: # running inside VM? - logger.warning(f"Cores per socket is {corespsocket}. 
Running inside a VM?") - logger.warning("Mapping by slot instead of socket") - self._map_by = "slot" - if self._hostfile: - _hls_list = str(os.getenv("MULTI_HLS_IPS", "")).split(",") - _world_size = 8 - _per_node_processes = int(_world_size / len(_hls_list)) - peval = (sockets * corespsocket) // _per_node_processes - else: - peval = (sockets * corespsocket) // self._world_size - return peval, sockets, corespsocket - - def setup_config_env_mpirun(self): - peval, _, _ = self.get_peval() - return f"--map-by {self._map_by}:PE={peval}" - - def create_single_card_setup(self, use_deepspeed=False): - """ - Single-card setup. - """ - - if use_deepspeed: - self._interpreter = f"deepspeed --num_gpus 1 --master_port {self._master_port} " - else: - self._interpreter = f"{sys.executable} " - - def create_single_node_setup_mpirun(self): - """ - Single-node multi-card configuration setup for mpirun. - """ - - mpi_cmd = self.setup_config_env_mpirun() - self._interpreter = ( - f"mpirun -n {self._world_size} --bind-to core {mpi_cmd} --rank-by core --report-bindings" - f" --allow-run-as-root {sys.executable} " - ) - - def create_single_node_setup_deepspeed(self): - """ - Single-node multi-card configuration setup for DeepSpeed. - """ - - self._interpreter = ( - f"deepspeed --num_nodes 1 --num_gpus {self._world_size} --no_local_rank --master_port {self._master_port} " - ) - - def create_single_node_setup(self): - """ - Single-node multi-card configuration setup. - """ - - use_env_param = "--use_env" if self._use_env else "" - - self._interpreter = ( - f"{sys.executable} -um torch.distributed.run --nproc_per_node={self._world_size} {use_env_param} " - ) - - def create_multi_node_setup(self): - """ - Multi-node configuration setup for DeepSpeed. - """ - - master_addr = self.process_hostfile() - self._interpreter = f"deepspeed --hostfile {self._hostfile} --master_addr {master_addr} --no_local_rank --master_port {self._master_port} " - - def run(self): - """ - Runs the desired command with configuration specified by the user. - """ - - try: - if self._model_env_vars: - print("Running with the following model specific env vars: ") - for env_name, env_val in [ - *self._model_env_vars.items() - ]: # iterate key value pairs of self._model_env_vars - print(f"{env_name}={env_val}") - if "LD_PRELOAD" in str(env_name) and os.environ.get(str(env_name), None): - os.environ[str(env_name)] = str(env_val) + ":" + os.environ.get(str(env_name), None) - else: - os.environ[str(env_name)] = str(env_val) - for command in self._commands: - command = self._interpreter + command - print(f"{self.__class__.__name__} run(): command = {command}") - sys.stdout.flush() - sys.stderr.flush() - with subprocess.Popen(command, shell=True, executable="/bin/bash") as proc: - proc.wait() - sys.stdout.flush() - sys.stderr.flush() - if proc.returncode != 0: - logger.error(f"{command} exited with status = {proc.returncode}") - return proc.returncode - if self._model_env_vars: - for env_name in [*self._model_env_vars.keys()]: # iterate keys of self._model_env_vars - del os.environ[str(env_name)] - except Exception as exc: - raise RuntimeError(f"Error in {self.__class__.__name__} run()") from exc - - def process_hostfile(self) -> str: - """ - Returns the master address to use for multi-node runs with DeepSpeed. - Directly inspired from https://github.com/microsoft/DeepSpeed/blob/316c4a43e0802a979951ee17f735daf77ea9780f/deepspeed/autotuning/utils.py#L145. - - Returns: - str: address of the master node. 
- """ - if not self._hostfile.is_file(): - raise ValueError(f"Unable to find hostfile at {self._hostfile}.") - - # e.g., worker-0 slots=16 - with self._hostfile.open("r") as file: - resource_pool = {} - master_addr = None - for line in file.readlines(): - line = line.strip() - if line == "": - # skip empty lines - continue - try: - hostname, slots = line.split() - _, slot_count = slots.split("=") - slot_count = int(slot_count) - if master_addr is None: - master_addr = hostname - except ValueError as err: - logger.error("Hostfile is not formatted correctly, unable to proceed with training/inference.") - raise err - if hostname in resource_pool: - logger.error("Hostfile contains duplicate hosts, unable to proceed with training/inference.") - raise ValueError(f"Host {hostname} is already defined") - resource_pool[hostname] = slot_count - - return master_addr diff --git a/optimum/habana/distributed/fast_ddp.py b/optimum/habana/distributed/fast_ddp.py deleted file mode 100644 index ec01cb72f8..0000000000 --- a/optimum/habana/distributed/fast_ddp.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -############################################################################### - -""" -Fast and lightweight alternative to DistributeDataParallel for Habana Gaudi -""" - -import torch - - -def all_reduce_gradients( - model: torch.nn.Module, fusion_buffer_dtype: torch.dtype = torch.bfloat16, use_hpu_graphs: bool = True -): - """ - Invokes an all-reduce operation on the gradients supporting data parallel training. - - This function is meant to be called after forward+backward passes, where the gradient information is available in the model parameters. - Once called, the list of gradients participating in the training process must remain the same. - - Args: - model (torch.nn.Module): A model whose gradients are meant to be all-reduced. - fusion_buffer_dtype (torch.dtype): The dtype of internally allocated gradient fusion buffer. - use_hpu_graphs (bool): Determines whether HPU graph recording should be used for packing and unpacking the gradients. - - Raises: - NotImplementedError: `all_reduce_gradients()` does not support changing the set of active gradients after first invocation. - """ - - # Try to get the existing fusion buffer created for the model. - fusion_entries = model.__dict__.get("_all_reduce_fusion_entries", None) - if fusion_entries is not None: - if len(fusion_entries) == 0: - # There is nothing to all-reduce, neither the fusion buffer. - return - - fusion_buffer = model._all_reduce_fusion_buffer - if use_hpu_graphs: - pack_graph = model._all_reduce_gradient_pack_graph - unpack_graph = model._all_reduce_gradient_unpack_graph - else: - # Count the total number of elements of the reduced gradients. 
- grad_elem_count = 0 - for param in model.parameters(): - if param.grad is None: - continue - grad_elem_count += torch.numel(param.grad) - - # There is nothing to all-reduce. - if grad_elem_count == 0: - model.__dict__["_all_reduce_fusion_entries"] = [] - return - - # Allocate the fusion buffer and associate it with the model. - fusion_buffer = torch.zeros(size=(grad_elem_count,), dtype=fusion_buffer_dtype, device="hpu:0") - model.__dict__["_all_reduce_fusion_buffer"] = fusion_buffer - - # Build the fusion information necessary for gradient packing and unpacking processes. - grad_elem_count = 0 - fusion_entries = [] - for param in model.parameters(): - if param.grad is None: - continue - grad_numel = torch.numel(param.grad) - fused_view = fusion_buffer[grad_elem_count : grad_elem_count + grad_numel].reshape(param.grad.shape) - fusion_entries.append((param, fused_view)) - grad_elem_count += grad_numel - model.__dict__["_all_reduce_fusion_entries"] = fusion_entries - - # Instruct the following logic to record packing and unpacking HPU graphs based on the newly created fusion buffer. - if use_hpu_graphs: - pack_graph = None - unpack_graph = None - - # Pack the gradients into the fusion buffer. - def pack_grads(): - world_size_inv = 1.0 / torch.distributed.group.WORLD.size() - for param, fused_view in fusion_entries: - grad = param.grad - - if grad is None: - raise NotImplementedError( - "`all_reduce_gradients()` does not support changing the set of active gradients after first invocation." - ) - - if grad.dtype != fusion_buffer_dtype: - grad = grad.to(fusion_buffer_dtype) - grad = grad * world_size_inv - fused_view.copy_(grad, non_blocking=True) - - if use_hpu_graphs: - if pack_graph is None: - import habana_frameworks.torch as ht - - pack_graph = ht.hpu.HPUGraph() - with ht.hpu.stream(ht.hpu.Stream()): - pack_graph.capture_begin() - pack_grads() - pack_graph.capture_end() - model.__dict__["_all_reduce_gradient_pack_graph"] = pack_graph - - pack_graph.replay() - else: - pack_grads() - - # Invoke an all-reduce operation of the fused gradients. - torch.distributed.all_reduce(fusion_buffer, group=torch.distributed.group.WORLD, async_op=True) - - # Unpack the gradients back to the model parameters. - def unpack_grads(): - for param, fused_view in fusion_entries: - grad = param.grad - - if grad is None: - raise NotImplementedError( - "`all_reduce_gradients()` does not support changing the set of active gradients after first invocation." - ) - - if fused_view.dtype != grad.dtype: - fused_view = fused_view.to(grad.dtype) - - grad.copy_(fused_view, non_blocking=True) - - if use_hpu_graphs: - if unpack_graph is None: - import habana_frameworks.torch as ht - - unpack_graph = ht.hpu.HPUGraph() - with ht.hpu.stream(ht.hpu.Stream()): - unpack_graph.capture_begin() - unpack_grads() - unpack_graph.capture_end() - model.__dict__["_all_reduce_gradient_unpack_graph"] = unpack_graph - - unpack_graph.replay() - else: - unpack_grads() diff --git a/optimum/habana/distributed/serialization.py b/optimum/habana/distributed/serialization.py deleted file mode 100644 index bf59fb2445..0000000000 --- a/optimum/habana/distributed/serialization.py +++ /dev/null @@ -1,475 +0,0 @@ -# Copyright 2024 The Foundation Model Stack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
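`all_reduce_gradients()` above is a lightweight alternative to DistributedDataParallel: it packs all gradients into one fusion buffer, all-reduces that buffer, then unpacks it back into the parameters, optionally recording the pack/unpack steps as HPU graphs. A hedged sketch of where it sits in a data-parallel training loop (model, optimizer, batch and loss are placeholders, and `torch.distributed` is assumed to be initialized already):

```python
import torch
from optimum.habana.distributed import all_reduce_gradients

def training_step(model: torch.nn.Module, optimizer, batch, loss_fn):
    optimizer.zero_grad(set_to_none=False)
    loss = loss_fn(model(batch["inputs"]), batch["labels"])
    loss.backward()
    # Average gradients across workers through the fused buffer. The set of
    # parameters that carry gradients must stay identical across invocations.
    all_reduce_gradients(model, fusion_buffer_dtype=torch.bfloat16, use_hpu_graphs=True)
    optimizer.step()
    return loss
```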
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file has been modified from its original version. -# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack - -import collections -import os -from collections import ChainMap -from collections.abc import Iterable -from pathlib import Path -from typing import Any, Callable, List, Mapping, MutableMapping, Optional, Union - -import torch - -from .tp import TPModule - - -__adapters: MutableMapping[str, MutableMapping[str, Callable[[Mapping], Mapping]]] = {} - - -def register_adapter( - architecture: str, - source: str, - adapter: Callable[[Mapping], Mapping], -): - """ - Registers a state dict adapter to be available to the (de) serialization - API. - - Args: - architecture: The name of the model architecture, e.g. 'llama' - source: A label representing the format of the weights to be converted. - E.g. 'hf' - adapter: the class of the adapter. The class must accept one constructor - parameter, which will be a state dict (`OrderedDict`) - """ - sources: MutableMapping[str, Callable[[Mapping], Mapping]] = {} - if architecture in __adapters: - sources = __adapters[architecture] - - if source in sources: - raise KeyError(f"Variant {source} already registered for architecture {architecture}") - - sources[source] = adapter - __adapters[architecture] = sources - - -def list_sources(architecture: str): - """ - Lists available sources (attribute formats) of a model architecture. - E.g. `models.list_variants('llama')` -> ['meta', 'fms', 'hf'] - Args: - architecture: one of the registered architectures returned by - `models.list_models()`. - """ - if architecture not in __adapters: - return [] - return list(__adapters[architecture].keys()) - - -def _get_adapter(architecture: str, source: Optional[str]) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: - if source is None or architecture not in __adapters or source not in __adapters[architecture]: - # if no adapter is registered, assume the attributes are already in - # fms format. - # should we raise an error here instead? - return lambda x: x - else: - return __adapters[architecture][source] - - -def get_adapted(architecture: str, source: Optional[str], state_dict: Mapping[str, Any]) -> Mapping[str, Any]: - """ - Convert a state dict to FMS format, using an adapter specified by name. - - Args: - architecture: one of the architectures from `models.list_models()`. - E.g. llama. - source: A reference to an attribute format - state_dict: the model.state_dict() to be converted/adapted. - """ - # sometimes we only load onto rank 0 so may not have a state_dict here. 
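The adapter registry above maps an `(architecture, source)` pair to a callable that converts a checkpoint state dict into FMS-style naming. A hedged sketch of registering and applying a trivial adapter (the architecture name and key-renaming rule are purely illustrative):

```python
from optimum.habana.distributed import serialization

def hf_to_fms_demo(state_dict):
    # Illustrative conversion only: strip a "model." prefix from every key.
    return {key.removeprefix("model."): value for key, value in state_dict.items()}

serialization.register_adapter("demo_arch", "hf", hf_to_fms_demo)
print(serialization.list_sources("demo_arch"))  # ['hf']

# get_adapted() looks up the adapter and applies it (a no-op if none is registered).
converted = serialization.get_adapted("demo_arch", "hf", {"model.embed.weight": 0})
```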
- if not len(state_dict): - return state_dict - adapter = _get_adapter(architecture, source) - adapted = adapter(state_dict) - return adapted - - -def _get_safetensors_item(key, file: Path, device: torch.device) -> torch.Tensor: - from safetensors import safe_open # type: ignore[import-untyped] - - with torch.no_grad(): - with safe_open(file, framework="pt", device=str(device)) as model_weights: # type: ignore[attr-defined] - return model_weights.get_tensor(key) - - -class LazySafetensorsDict(collections.UserDict): - def set_lazy_tensor(self, key, file, device): - super().__setitem__(key, lambda: _get_safetensors_item(key, file, device)) - - def __getitem__(self, key): - lazy_tensor = super().__getitem__(key) - if callable(lazy_tensor): - lazy_tensor = lazy_tensor() - super().__setitem__(key, lazy_tensor) - return lazy_tensor - - -def load_state_dict( - model_path: Union[str, Path], - *, - source: Optional[str] = None, - distributed_strategy: Optional[str] = None, - checkpoint_sharding: Optional[str] = None, - initial_device: torch.device = torch.device("cpu"), - rank: int = 0, - world_size: int = 1, -) -> MutableMapping[str, Any]: - """ - Validates that the file(s) found at a checkpoint path are compatible with - the intended (possibly distributed) use-case, and returns a lazy loading - state dict if possible (some formats may not support that). - - If model_path is a directory, it'll try to load models based on the source - (e.g. .bin for HF, .pth for Meta), and, if no source is specified or hasn't - been registered, it'll try .safetensors, .pth, and .bin. - - Args: - model_path: the path to find the weights. If not set, return None. - source: If the weights in the state dict didn't come from an FMS model, - `source` specifies which conversion function might be needed. - See `serialization.list_sources(architecture)` - distributed_strategy: the kind of possibly-distributed model in which we - intend to load these weights. E.g. tp, fsdp, None. Used for - validation. - checkpoint_sharding: the sharding format of the checkpoint. - E.g. layer, tp, fsdp. - initial_device: where the state dict will be loaded if not lazy. - If meta, return empty dict. 
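A hedged sketch of calling `load_state_dict()` above to lazily load an HF-format checkpoint for a tensor-parallel run (the path, rank and world size are placeholders):

```python
import torch
from optimum.habana.distributed.serialization import load_state_dict

state_dict = load_state_dict(
    "~/checkpoints/my-model",        # placeholder; a glob such as "model-*.safetensors" also works
    source="hf",
    distributed_strategy="tp",
    checkpoint_sharding=None,        # single, unsharded checkpoint
    initial_device=torch.device("cpu"),
    rank=0,
    world_size=8,
)
```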
- """ - if model_path is None or initial_device.type == "meta": - return {} - if checkpoint_sharding == "fsdp" and distributed_strategy not in ["fsdp", "hsdp"]: - raise ValueError("FSDP checkpoints can only be loaded into an FSDP model") - if checkpoint_sharding == "tp" and distributed_strategy != "tp": - raise ValueError("TP checkpoints can only be loaded into a TP model") - - # Before creating the Path object, check if model_path has a glob pattern - if isinstance(model_path, str): - model_path, sep, glob_pattern = model_path.partition("*") - else: - sep = "" - glob_pattern = "" - glob_pattern = sep + glob_pattern - - model_path = Path(os.path.expanduser(model_path)) - - checkpoints = [] - - if model_path.is_dir(): - if glob_pattern != "": - glob_pattern_list = [glob_pattern] - elif source == "meta": - glob_pattern_list = ["*.pth", "*.safetensors"] - elif source == "hf": - glob_pattern_list = ["*.bin", "*.safetensors"] - else: - glob_pattern_list = ["*.safetensors", "*.pth", "*.bin"] - for glob_pattern_possibility in glob_pattern_list: - file_list = list(model_path.glob(glob_pattern_possibility)) - if len(file_list) > 0: - checkpoints = sorted(file_list) - break - - if model_path.is_file(): - checkpoints = [model_path] - - # Check if we found some files - assert len(checkpoints) > 0, f"Can't find the requested checkpoint data at {model_path}" - - if checkpoint_sharding is not None and checkpoint_sharding != "layer": - assert ( - world_size == len(checkpoints) - ), f"Loading a {checkpoint_sharding}-sharded checkpoint with len={len(checkpoints)} but world size is {world_size}" - - checkpoints = [checkpoints[rank]] - - # if there's only one checkpoint for fsdp/hsdp, load it only into rank zero - # and it will be distributed by the FSDP `sync_module_states` parameter - if checkpoint_sharding is None and distributed_strategy in {"hsdp", "fsdp"}: - if rank == 0: - checkpoints = [checkpoints[0]] - else: - return {} - - checkpoint_sds = [] - if checkpoints[0].suffix == ".safetensors": - for ckp in checkpoints: - checkpoint_sds.append( - _load_safetensors_state_dict( - ckp, - initial_device, - ) - ) - else: - with torch.no_grad(): - checkpoint_sds = [ - torch.load(str(ckpt_path), map_location=initial_device, mmap=True) for ckpt_path in checkpoints - ] - return ChainMap(*checkpoint_sds) - - -def _load_safetensors_state_dict( - checkpoint: Path, - device: torch.device, -): - sd = LazySafetensorsDict() - - from safetensors import safe_open - - with safe_open(checkpoint, framework="pt", device=str(device)) as model_weights: # type: ignore[attr-defined] - sd_keys = list(model_weights.keys()) - for key in sd_keys: - sd.set_lazy_tensor(key, checkpoint, device) - return sd - - -class FusableWeightsMissingError(Exception): - missing_weights: List[str] = [] - - def __init__(self, missing_weights): - self.missing_weights = missing_weights - super().__init__() - - -def load_state_dict_into_model( - model: torch.nn.Module, - state_dict: MutableMapping[str, Any], - architecture: str, - source: str, - distributed_strategy: Optional[str] = None, - checkpoint_sharding: Optional[str] = None, - initial_device: torch.device = torch.device("cpu"), - rank: int = 0, - world_size: int = 0, -) -> None: - """ - This function loads state_dict into model in the most efficient way possible, - and it removes all weights that have been used in model from state_dict - in order to conserve memory. - - Args: - model: The model where the weights are being loaded. - state_dict: The dictionary with all the weights. 
If it has been mmaped - (for torch.load) or it is an instance of LazySafetensorsDict, - the weights are loaded lazily from disk. - architecture: the model architecture, e.g. llama. See `models.list_models()`. - source: If the weights in the state dict didn't come from an FMS model, - `source` specifies which conversion function might be needed. - See `serialization.list_sources(architecture)` - distributed_strategy: the kind of possibly-distributed model in which we - intend to load these weights. E.g. tp, fsdp, None. Used for weight - sharding. - checkpoint_sharding: the sharding format of the checkpoint. - E.g. layer, tp, fsdp. Used for weight sharding. - initial_device: where the weights will be loaded from disk. - """ - - # 1. Get the adapter from checkpoint sd to fms sd - adapter = _get_adapter(architecture, source) - - # 2. Decide if model needs sharding and how (for now only TP) - needs_tp_sharding = checkpoint_sharding != "tp" and distributed_strategy == "tp" - - # 3. Iterate over the weights and load them into the model - used_keys = set() - sd_keys = list(state_dict.keys()) - with torch.no_grad(): - for key in sd_keys: - if key in used_keys: - continue - used_keys.add(key) - try: - partial_sd = {key: state_dict[key]} - if partial_sd[key].device != initial_device: - partial_sd[key] = partial_sd[key].to(device=initial_device) - fms_partial_sd = adapter(partial_sd) - except FusableWeightsMissingError as e: - for weight in e.missing_weights: - used_keys.add(weight) - partial_sd[weight] = state_dict[weight] - if partial_sd[weight].device != initial_device: - partial_sd[weight] = partial_sd[weight].to(device=initial_device) - fms_partial_sd = adapter(partial_sd) - _load_partial_state_dict(model, fms_partial_sd, needs_tp_sharding, rank, world_size) - for p_key in partial_sd.keys(): - if isinstance(state_dict, ChainMap): - for child_sd in state_dict.maps: - child_sd.pop(p_key, None) - else: - state_dict.pop(p_key) - del partial_sd - del fms_partial_sd - - -def _copy_colwise(param: torch.nn.Parameter, tensor_value, is_bias, rank, world_size): - """ - This function copies the correct shard of the weights for a colwise-TP'd module - according to the rank of the process and the world_size. - - Args - ==== - param: torch.nn.Parameter - Parameter that has had TP applied - tensor_value: torch.Tensor - tensor that needs sharding - rank: int - Rank of the current process - world_size: int - Total number of TP processes - """ - # Divide the weight matrix along the first dimension. - output_size_per_partition = param.shape[0] - if not is_bias: - tensor = tensor_value[ - (rank * output_size_per_partition) : ((rank + 1) * output_size_per_partition), - :, - ] - else: - tensor = tensor_value[(rank * output_size_per_partition) : ((rank + 1) * output_size_per_partition)] - param.copy_(tensor, non_blocking=True) - - -def _copy_rowwise(param: torch.nn.Parameter, tensor_value, is_bias, rank, world_size): - """ - This function copies the correct shard of the weights for a rowwise-TP'd module - according to the rank of the process and the world_size. - - Args - ==== - param: torch.nn.Parameter - Parameter that has had TP applied - tensor_value: torch.Tensor - tensor that needs sharding - rank: int - Rank of the current process - world_size: int - Total number of TP processes - """ - # Divide the weight matrix along the last dimension. 
- if not is_bias: - output_size_per_partition = param.shape[1] - tensor = tensor_value[ - :, - (rank * output_size_per_partition) : ((rank + 1) * output_size_per_partition), - ] - param.copy_(tensor, non_blocking=True) - else: - if rank == 0: - _copy_if_present(param, tensor_value) - else: - param.zero_() - - -def _copy_embedding(param: torch.nn.Parameter, tensor_value, rank, world_size): - """ - This function copies the correct shard of the weights for a TP'd embedding module - according to the rank of the process and the world_size. - - Args - ==== - param: torch.nn.Parameter - Parameter that has had TP applied - tensor_value: torch.Tensor - tensor that needs sharding - rank: int - Rank of the current process - world_size: int - Total number of TP processes - """ - # Divide the weight matrix along the last dimension. - output_size_per_partition = param.shape[1] - tensor = tensor_value[ - :, - (rank * output_size_per_partition) : ((rank + 1) * output_size_per_partition), - ] - param.copy_(tensor, non_blocking=True) - - -def _copy_if_present(parameter, tensor_value): - parameter.copy_(tensor_value, non_blocking=True) - - -def _load_partial_state_dict( - model: torch.nn.Module, - state_dict, - needs_tp_sharding: bool, - rank=0, - world_size=1, -): - unused_params = [] - for key, tensor_value in state_dict.items(): - target_module = model - # Find where to put the weight and decide whether it needs TP'ing - key_steps = key.split(".") - prefix = "" - key_step = 0 - tp_module = None - # Navigate the model tree to find the module where the parameter is - # located and whether there is a TPModule in the way in case the - # parameter requires sharding - while key_step < len(key_steps) - 1: - try: - target_module = getattr(target_module, key_steps[key_step]) - if key_step > 0: - prefix += "." - prefix += key_steps[key_step] - key_step += 1 - if isinstance(target_module, Iterable): - target_module = target_module[int(key_steps[key_step])] # type: ignore[index] - prefix += "." + key_steps[key_step] - key_step += 1 - if isinstance(target_module, TPModule): - tp_module = target_module - except AttributeError: - unused_params.append(key) - break - - # Check if target_module has the Parameter/buffer - try: - param = getattr(target_module, key_steps[-1]) - - # If TP sharding is not needed, copy the parameter - # into the model - if not needs_tp_sharding or tp_module is None: - _copy_if_present(param, tensor_value) - elif tp_module is not None: - # Handle TP sharding - if key_steps[-2] in tp_module.colwise_param_names(): - _copy_colwise( - param, - tensor_value, - key_steps[-1] == "bias", - rank, - world_size, - ) - if key_steps[-2] in tp_module.rowwise_param_names(): - _copy_rowwise( - param, - tensor_value, - key_steps[-1] == "bias", - rank, - world_size, - ) - if key_steps[-2] in tp_module.embedding_param_names(): - _copy_embedding( - param, - tensor_value, - rank, - world_size, - ) - except AttributeError: - unused_params.append(key) diff --git a/optimum/habana/distributed/strategy.py b/optimum/habana/distributed/strategy.py deleted file mode 100644 index 91b3f00232..0000000000 --- a/optimum/habana/distributed/strategy.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2024 The Foundation Model Stack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
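`_copy_colwise()` and `_copy_rowwise()` above copy the shard of a full weight owned by the current rank. The slicing arithmetic they implement can be illustrated on its own (shapes and ranks below are illustrative):

```python
import torch

world_size, rank = 4, 1
full_weight = torch.randn(1024, 512)  # [out_features, in_features]

# Column-wise TP: split along dim 0 (output features).
out_per_rank = full_weight.shape[0] // world_size
colwise_shard = full_weight[rank * out_per_rank:(rank + 1) * out_per_rank, :]

# Row-wise TP: split along dim 1 (input features); only rank 0 keeps the bias.
in_per_rank = full_weight.shape[1] // world_size
rowwise_shard = full_weight[:, rank * in_per_rank:(rank + 1) * in_per_rank]

assert colwise_shard.shape == (256, 512)
assert rowwise_shard.shape == (1024, 128)
```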
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file has been modified from its original version. -# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack - -from abc import abstractmethod -from typing import List - -import torch -import torch.distributed -from torch import nn - - -class DistributedStrategy: - def __init__(self, from_meta=False): - self.from_meta = from_meta - - def distribute_module(self, module: nn.Module, final_layers: bool = False) -> nn.Module: - """ - Optionally a distributed strategy may distribute modules that are not - numbered layers - """ - return module - - @abstractmethod - def distribute_layer(self, block: nn.Module, layer: int) -> nn.Module: - """ - Distribute each layer as-appropriate - """ - pass - - -class NotDistributed(DistributedStrategy): - def __init__(self, from_meta=False): - super().__init__(from_meta) - - def distribute_module(self, module: nn.Module, final_layers: bool = False) -> nn.Module: - return module - - def distribute_layer(self, block: nn.Module, layer: int) -> nn.Module: - return block - - -NoOpStrategy = NotDistributed() - - -class DeviceMover(nn.Module): - def __init__(self, module: nn.Module, device): - super().__init__() - self.device = device - # make this wrapper module behave as if it was the wrapped module. - attr = module.__dict__ - attr["module"] = module.to(device) - attr["device"] = device - self.__dict__ = attr - - def forward(self, *args, **kwargs): - device = self.device - args = [arg.to(device) if isinstance(arg, torch.Tensor) else arg for arg in args] - kwargs = {k: (kwargs[k].to(device) if isinstance(kwargs[k], torch.Tensor) else kwargs[k]) for k in kwargs} - return self.module(*args, **kwargs) - - -class UniformModelParallelStrategy(DistributedStrategy): - def __init__(self, devices: List[int], num_layers: int, from_meta=False): - super().__init__(from_meta) - num_dev = len(devices) - layers_per_dev = num_layers // num_dev - remainder = num_layers - (layers_per_dev * num_dev) - self.layer_to_device = [0] * num_layers - layer_id = 0 - for dev_idx in range(len(devices)): - for i in range(layers_per_dev): - self.layer_to_device[layer_id] = devices[dev_idx] - layer_id = layer_id + 1 - if remainder > 0: - self.layer_to_device[layer_id] = devices[dev_idx] - layer_id = layer_id + 1 - remainder -= 1 - - def distribute_layer(self, block: nn.Module, layer: int) -> nn.Module: - device = self.layer_to_device[layer] - if self.from_meta: - block.to_empty(device=device) # type: ignore[arg-type] - wrapped = DeviceMover(block, device) - return wrapped - - def distribute_module(self, module: nn.Module, final_layers: bool = False) -> nn.Module: - if final_layers: - device = self.layer_to_device[len(self.layer_to_device) - 1] - else: - device = self.layer_to_device[0] - if self.from_meta: - return module.to_empty(device=device) # type: ignore[arg-type] - wrapped = DeviceMover(module, device) - return wrapped - - -class TensorParallelStrategy(DistributedStrategy): - def __init__(self, group=None, from_meta=False): - super().__init__(from_meta) - assert torch.distributed.is_initialized(), "must initialize a process 
group" - self.group = group if group is not None else torch.distributed.GroupMember.WORLD - - def distribute_module(self, module: nn.Module, final_layers: bool = False) -> nn.Module: - from optimum.habana.distributed import tp_wrapping - - return tp_wrapping.apply_tp(module, self.group) - - def distribute_layer(self, block: nn.Module, layer: int) -> nn.Module: - from optimum.habana.distributed import tp_wrapping - - return tp_wrapping.apply_tp(block, layer, self.group) - - def __getstate__(self): - state = self.__dict__.copy() - state["group"] = None # Remove ProcessGroup from state - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.group = None # Restore to default state or reinitialize diff --git a/optimum/habana/distributed/tensorparallel.py b/optimum/habana/distributed/tensorparallel.py deleted file mode 100644 index ba423175b4..0000000000 --- a/optimum/habana/distributed/tensorparallel.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2024 The Foundation Model Stack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file has been modified from its original version. -# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack - - -import torch -import torch._inductor.ir as ir -import torch._inductor.lowering as lowering -import torch.distributed as dist -from torch import nn - -#This needs to be fixed Issues can be tracked at - SW-192548 -def disable_compiler(fn): - if hasattr(torch, "compiler") and hasattr(torch.nn.Module, "compile"): - return torch.compiler.disable(fn) - return fn - -def apply_colwise_tp(par_mod: nn.Linear, mod: nn.Linear, world_size, rank): - # Divide the weight matrix along the last dimension. - output_size_per_partition = mod.out_features // world_size - with torch.no_grad(): - par_mod.weight.copy_( - torch.split(mod.weight, output_size_per_partition, dim=0)[rank] - ) - if par_mod.bias is not None: - par_mod.bias.copy_(torch.split(mod.bias, output_size_per_partition)[rank]) - - -def apply_rowwise_tp(par_mod: nn.Linear, mod: nn.Linear, world_size, rank): - # Divide the weight matrix along the last dimension. - output_size_per_partition = mod.in_features // world_size - with torch.no_grad(): - par_mod.weight.copy_( - torch.split(mod.weight, output_size_per_partition, dim=1)[rank] - ) - if par_mod.bias is not None: - if rank == 0: - par_mod.bias.copy_(mod.bias) - else: - par_mod.bias.zero_() - - -def apply_embedding_tp(par_mod: nn.Embedding, mod: nn.Embedding, world_size, rank): - # Divide the weight matrix along the last dimension. 
- output_size_per_partition = mod.embedding_dim // world_size - with torch.no_grad(): - par_mod.weight.copy_( - torch.split(mod.weight, output_size_per_partition, dim=1)[rank] - ) - - -## Fixes for PT 2.2 collectives until PT 2.3 is released - - -# Fix 1: https://github.com/pytorch/pytorch/issues/121311 -def get_volatile_reads_fixed(self): - inp = self.inputs[0] - if isinstance(inp, ir._CollectiveKernel): - # Out-of-place single-output - return [inp.inputs[0]] - elif isinstance(inp, ir.MultiOutput): - # Out-of-place multi-output - coll = inp.inputs[0] - if isinstance(coll, ir._CollectiveKernel): - _, idx = inp.indices[0] - return [coll.inputs[idx]] - return [] # e.g. regular FallbackKernel - else: - # In-place requires no additional deps handling for volatile - # reads since the inputs are mutated. - return [] - - -ir._WaitKernel.get_volatile_reads = get_volatile_reads_fixed - -# Fix 2: These are fixed already in nightlies and will be in 2.3 -for overload in torch.ops._c10d_functional.all_reduce.overloads(): - other_fn = getattr(torch.ops._c10d_functional.all_reduce, overload) - if other_fn in lowering.lowerings: - del lowering.lowerings[other_fn] - -@disable_compiler -def _all_reduce(input_: torch.Tensor) -> torch.Tensor: - """All-reduce the input tensor across model parallel group.""" - world_size = dist.get_world_size() - - if world_size == 1: - return input_ - - # Starting PT 2.3, we can go back to funcol.all_reduce - return torch.ops._c10d_functional.wait_tensor( - torch.ops._c10d_functional.all_reduce(input_, "sum", "default") - ) - -class _ReduceFromModelParallelRegion(torch.autograd.Function): - """All-reduce the input from the model parallel region.""" - - @staticmethod - def symbolic(graph, input_): - return _all_reduce(input_) - - @staticmethod - def forward(ctx, input_): - return _all_reduce(input_) - - @staticmethod - def backward(ctx, grad_output): - return grad_output - -def reduce_from_tensor_model_parallel_region(input_): - return _ReduceFromModelParallelRegion.apply(input_) diff --git a/optimum/habana/distributed/tp.py b/optimum/habana/distributed/tp.py deleted file mode 100644 index c4f156fa61..0000000000 --- a/optimum/habana/distributed/tp.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2024 The Foundation Model Stack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file has been modified from its original version. -# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack - -import itertools -from abc import ABCMeta, abstractmethod -from typing import List - -import torch -import torch.nn as nn -from torch.distributed.distributed_c10d import ProcessGroup - -from .tensorparallel import ( - apply_colwise_tp, - apply_embedding_tp, - apply_rowwise_tp, -) - - -class TPModule(nn.Module, metaclass=ABCMeta): - """ - This is an abstract class that any nn.Module can implement to enable - Tensor Parallel. 
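`reduce_from_tensor_model_parallel_region()` above wraps the all-reduce in an autograd `Function` whose backward pass is the identity, which is exactly what a row-parallel linear layer needs after its partial matmul. A hedged sketch of that use (the module below is illustrative and not part of this file):

```python
import torch
from torch import nn

from optimum.habana.distributed.tensorparallel import reduce_from_tensor_model_parallel_region


class RowParallelLinearDemo(nn.Module):
    """Each rank holds a slice of the input features; partial outputs are summed across ranks."""

    def __init__(self, in_features_per_rank: int, out_features: int):
        super().__init__()
        self.linear = nn.Linear(in_features_per_rank, out_features, bias=False)

    def forward(self, x_shard: torch.Tensor) -> torch.Tensor:
        partial = self.linear(x_shard)  # partial result on this rank
        return reduce_from_tensor_model_parallel_region(partial)  # summed over the TP group
```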
On top of inheriting from this class, the TP module - will have to implement list_colwise_weights, list_rowwise_weights, - list_embedding_weights, and import_module for their relevant weights. - Finally, the module must call setup_tp at the end of their __init__ - function. See examples in attention.py, feedforward.py and embedding.py - - """ - - rank: int - world_size: int - - def setup_tp(self, rank: int, world_size: int) -> None: - self.rank = rank - self.world_size = world_size - - def colwise_param_names(self) -> List[str]: - return [] - - def rowwise_param_names(self) -> List[str]: - return [] - - def embedding_param_names(self) -> List[str]: - return [] - - @staticmethod - @abstractmethod - def import_module(module, group: ProcessGroup): - pass - - def import_weights(self, module: nn.Module): - for weight in self.colwise_param_names(): - apply_colwise_tp( - getattr(self, weight), - getattr(module, weight), - self.world_size, - self.rank, - ) - for weight in self.rowwise_param_names(): - apply_rowwise_tp( - getattr(self, weight), - getattr(module, weight), - self.world_size, - self.rank, - ) - for weight in self.embedding_param_names(): - apply_embedding_tp( - getattr(self, weight), - getattr(module, weight), - self.world_size, - self.rank, - ) - tp_sharded_modules = list( - itertools.chain( - self.colwise_param_names(), - self.rowwise_param_names(), - self.embedding_param_names(), - ) - ) - with torch.no_grad(): - for mod_name, module in self.named_children(): - if mod_name not in tp_sharded_modules: - for param_name, param in module.named_parameters(recurse=False): - param.copy_( - getattr(getattr(module, mod_name), param_name), - non_blocking=True, - ) diff --git a/optimum/habana/distributed/tp_wrapping.py b/optimum/habana/distributed/tp_wrapping.py deleted file mode 100644 index 761fa7bff4..0000000000 --- a/optimum/habana/distributed/tp_wrapping.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2024 The Foundation Model Stack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file has been modified from its original version. 
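`TPModule` above defines the contract a tensor-parallel module fulfils: declare which child modules are column-wise, row-wise or embedding sharded, call `setup_tp()`, and provide `import_module()` to build the sharded copy from a non-parallel one. A hedged, minimal subclass illustrating that contract (a toy projection, not one of the real attention/MLP implementations):

```python
from typing import List

import torch.nn as nn
from torch.distributed.distributed_c10d import ProcessGroup

from optimum.habana.distributed.tp import TPModule


class ToyMLP(nn.Module):
    """Non-parallel reference module with a single projection."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.proj = nn.Linear(in_features, out_features, bias=False)


class TPToyMLP(TPModule):
    """Tensor-parallel version: `proj` is column-sharded across the TP group."""

    def __init__(self, in_features: int, out_features: int, group: ProcessGroup):
        super().__init__()
        world_size = group.size()
        self.proj = nn.Linear(in_features, out_features // world_size, bias=False)
        self.setup_tp(group.rank(), world_size)

    def colwise_param_names(self) -> List[str]:
        # Child modules whose weights are split along the output dimension.
        return ["proj"]

    @staticmethod
    def import_module(module: "ToyMLP", group: ProcessGroup) -> "TPToyMLP":
        tp_mod = TPToyMLP(module.proj.in_features, module.proj.out_features, group)
        tp_mod.import_weights(module)  # copies this rank's weight slice from `module`
        return tp_mod
```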
-# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack - -from torch import nn -from torch.distributed.distributed_c10d import ProcessGroup - -from ..transformers.models.llama.modeling_llama import ( - GaudiLlamaAttention, - GaudiLlamaMLP, - TPGaudiLlamaAttention, - TPGaudiLlamaMLP, -) - - -def _tp_wrapped(module: nn.Module, layer: int, group: ProcessGroup): - if hasattr(module, "to_tp"): - return module.to_tp(group) - elif isinstance(module, GaudiLlamaAttention): - return TPGaudiLlamaAttention.import_module(module, layer, group) - elif isinstance(module, GaudiLlamaMLP): - return TPGaudiLlamaMLP.import_module(module, group) - else: - return module - - -def apply_tp(model: nn.Module, layer_idx: int, group: ProcessGroup): - wrapped = _tp_wrapped(model, layer_idx, group) - if wrapped is not model: - return wrapped - - for name, layer in model.named_children(): - tp_layer = apply_tp(layer, layer_idx, group) - setattr(model, name, tp_layer) - return model diff --git a/optimum/habana/peft/__init__.py b/optimum/habana/peft/__init__.py deleted file mode 100644 index 3030f74e89..0000000000 --- a/optimum/habana/peft/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .layer import GaudiAdaloraLayerSVDLinearForward -from .peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation diff --git a/optimum/habana/peft/layer.py b/optimum/habana/peft/layer.py deleted file mode 100755 index dacafb1155..0000000000 --- a/optimum/habana/peft/layer.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Any - -import torch -from peft.utils.other import transpose - - -def GaudiAdaloraLayerSVDLinearForward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: - """ - Copied from SVDLinear.forward: https://github.com/huggingface/peft/blob/v0.9.0/src/peft/tuners/adalora/layer.py#L158 - The only differences are: - - fix batch_gemm failure for BF16 case - """ - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - lora_E = self.lora_E[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - ranknum = self.ranknum[active_adapter] + 1e-5 - - x = x.to(lora_A.dtype) - result += (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * (scaling / ranknum) - - return result - - -class LoRALinear: - def __init__(self, module): - has_bias = module.bias is not None - self.module = module - import habana_frameworks.torch.hpex.experimental.transformer_engine as te - - self.module.te_linear = te.Linear( - module.in_features, - module.out_features, - bias=has_bias, - params_dtype=module.weight.dtype, - skip_weight_param_allocation=True, - ) - - def _linear(self, input: torch.Tensor) -> torch.Tensor: - # TODO: to check if bias is removed from lora linear - if hasattr(self.module, "bias"): - return self.module.te_linear( - input, transpose(self.module.weight, self.module.fan_in_fan_out), bias=self.module.bias - ) - else: - return self.module.te_linear(input, transpose(self.module.weight, self.module.fan_in_fan_out)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - previous_dtype = x.dtype - - if self.module.disable_adapters: - if self.module.merged: - 
self.module.unmerge() - result = self._linear(x) - elif self.module.merged: - result = self._linear(x) - else: - result = self._linear(x) - for active_adapter in self.module.active_adapters: - if active_adapter not in self.module.lora_A.keys(): - continue - lora_A = self.module.lora_A[active_adapter] - lora_B = self.module.lora_B[active_adapter] - dropout = self.module.lora_dropout[active_adapter] - scaling = self.module.scaling[active_adapter] - x = x.to(lora_A.weight.dtype) - result = result.clone() + lora_B(lora_A(dropout(x))) * scaling - - result = result.to(previous_dtype) - return result - - @staticmethod - def replace_forward(module): - lora_linear = LoRALinear(module) - module.forward = lora_linear.forward diff --git a/optimum/habana/peft/peft_model.py b/optimum/habana/peft/peft_model.py deleted file mode 100644 index 1fabf4034c..0000000000 --- a/optimum/habana/peft/peft_model.py +++ /dev/null @@ -1,112 +0,0 @@ -import warnings - -import packaging.version -import torch -import transformers -from peft import PeftType - - -def gaudi_generate(self, *args, **kwargs): - peft_config = self.active_peft_config - self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation - if hasattr(self.base_model, "model"): - self.base_model.model.generation_config = self.generation_config - else: - self.base_model.generation_config = self.generation_config - try: - if not peft_config.is_prompt_learning: - with self._enable_peft_forward_hooks(*args, **kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - outputs = self.base_model.generate(*args, **kwargs) - else: - kwargs["num_virtual_tokens"] = peft_config.num_virtual_tokens - outputs = self.base_model.generate(**kwargs) - except: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - raise - else: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - return outputs - - -def gaudi_prepare_inputs_for_generation(self, *args, task_ids: torch.Tensor = None, **kwargs): - """ - Copied from PeftModelForCausalLM.prepare_inputs_for_generation: https://github.com/huggingface/peft/blob/v0.9.0/src/peft/peft_model.py#L1156 - The only differences are: - - add token_idx disposal for prompt learning - """ - peft_config = self.active_peft_config - model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) - # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format - # for some architectures which requires a special fix for prompt tuning etc. - # TODO: starting with transformers 4.38, all architectures should support caching. - uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") - uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") - transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] - uses_cache = uses_transformers_4_38 or ( - uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs - ) - if peft_config.peft_type == PeftType.POLY: - model_kwargs["task_ids"] = task_ids - - if peft_config.is_prompt_learning: - if uses_cache and (model_kwargs["past_key_values"] is not None): - # change in the logic of `prepare_inputs_for_generation` makes the below code necessary - # In prompt learning methods, past key values are longer when compared to the `input_ids`. 
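`GaudiAdaloraLayerSVDLinearForward` above is a drop-in replacement for `SVDLinear.forward` from peft's AdaLoRA tuner, differing only in the BF16 batch-gemm fix, while `LoRALinear.replace_forward` similarly swaps a LoRA linear's `forward` for one that runs the frozen base projection through transformer engine. A hedged sketch of how the AdaLoRA forward is typically wired in by monkey-patching (assuming the peft v0.9-style module path referenced in the docstring):

```python
from peft.tuners.adalora.layer import SVDLinear

from optimum.habana.peft import GaudiAdaloraLayerSVDLinearForward

# Swap in the Gaudi-friendly forward; since this patches the class, both existing
# and newly created AdaLoRA layers will use it.
SVDLinear.forward = GaudiAdaloraLayerSVDLinearForward
```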
- # As such only consider the last input ids in the autogressive generation phase. - if model_kwargs.get("reuse_cache", False): - if model_kwargs["past_key_values"][0][0][-2] >= model_kwargs["input_ids"].shape[1]: - model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] - else: - if model_kwargs["past_key_values"][0][0].shape[-2] >= model_kwargs["input_ids"].shape[1]: - model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] - - if model_kwargs.get("attention_mask", None) is not None: - size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens - prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) - model_kwargs["attention_mask"] = torch.cat((prefix_attention_mask, model_kwargs["attention_mask"]), dim=1) - - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None: - token_idx = token_idx + peft_config.num_virtual_tokens - - token_idx_cpu = model_kwargs.get("token_idx_cpu", None) - if token_idx_cpu is not None: - token_idx_cpu = token_idx_cpu + peft_config.num_virtual_tokens - model_kwargs["token_idx_cpu"] = token_idx_cpu - - if model_kwargs.get("position_ids", None) is not None and token_idx is None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - model_kwargs["position_ids"] = None - - if kwargs.get("token_type_ids", None) is not None: - warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids") - kwargs["token_type_ids"] = None - - if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: - past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) - model_kwargs["past_key_values"] = past_key_values - else: - if model_kwargs["past_key_values"] is None: - inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) - prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) - prompts = prompts.to(inputs_embeds.dtype) - model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) - model_kwargs["input_ids"] = None - if token_idx is not None: - attention_mask = model_kwargs["attention_mask"] - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if peft_config.peft_type == PeftType.PREFIX_TUNING and model_kwargs["input_ids"].shape[-1] != 1: - position_ids = position_ids[:, -model_kwargs["input_ids"].shape[-1] :] - if model_kwargs["past_key_values"] is not None and model_kwargs["input_ids"].shape[-1] == 1: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - model_kwargs["position_ids"] = position_ids - model_kwargs["token_idx"] = token_idx - # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is - # passed in the forward pass to keep track of the position ids of the cache. We have to - # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed - # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 - _ = model_kwargs.pop("cache_position", None) - - return model_kwargs diff --git a/optimum/habana/transformers/__init__.py b/optimum/habana/transformers/__init__.py deleted file mode 100644 index 82791b64cf..0000000000 --- a/optimum/habana/transformers/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' 
imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .gaudi_configuration import GaudiConfig -from .trainer import GaudiTrainer -from .trainer_seq2seq import GaudiSeq2SeqTrainer -from .training_args import GaudiTrainingArguments -from .training_args_seq2seq import GaudiSeq2SeqTrainingArguments diff --git a/optimum/habana/transformers/gaudi_configuration.py b/optimum/habana/transformers/gaudi_configuration.py deleted file mode 100644 index 76638d8e95..0000000000 --- a/optimum/habana/transformers/gaudi_configuration.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys -from pathlib import Path - -from optimum.configuration_utils import BaseConfig -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - -# Default bf16 and fp32 ops (BERT) -DEFAULT_BF16_OPS = [ - "add", - "addmm", - "bmm", - "div", - "dropout", - "gelu", - "iadd", - "linear", - "layer_norm", - "matmul", - "mm", - "rsub", - "softmax", - "truediv", -] -DEFAULT_FP32_OPS = [ - "embedding", - "nll_loss", - "log_softmax", -] -GAUDI_CONFIG_NAME = "gaudi_config.json" - - -class GaudiConfig(BaseConfig): - CONFIG_NAME = "gaudi_config.json" - FULL_CONFIGURATION_FILE = "gaudi_config.json" - - def __init__(self, **kwargs): - # Torch Autocast - self.use_torch_autocast = kwargs.pop("use_torch_autocast", False) - self.autocast_bf16_ops = kwargs.pop("autocast_bf16_ops", None) - self.autocast_fp32_ops = kwargs.pop("autocast_fp32_ops", None) - self.use_dynamic_shapes = kwargs.pop("use_dynamic_shapes", False) - - # Use Habana's custom AdamW implementation - self.use_fused_adam = kwargs.pop("use_fused_adam", False) - # Use Habana's custom fused clip norm implementation - self.use_fused_clip_norm = kwargs.pop("use_fused_clip_norm", False) - - # TODO: to remove in a future version - - def write_bf16_fp32_ops_to_text_files( - self, - path_to_bf16_file: Path, - path_to_fp32_file: Path, - ): - for path, ops in zip( - [Path(path_to_bf16_file), Path(path_to_fp32_file)], [self.autocast_bf16_ops, self.autocast_fp32_ops] - ): - with path.open("w") as text_file: - # writelines does not add new lines after each element so "\n" is inserted - text_file.writelines(op + "\n" for op in ops) - - def declare_autocast_bf16_fp32_ops(self): - if self.autocast_bf16_ops is not None and self.autocast_fp32_ops is not None: - if "habana_frameworks.torch.core" in sys.modules: - raise RuntimeError( - "Setting bf16/fp32 ops for Torch Autocast but `habana_frameworks.torch.core` has already been imported. " - "You should instantiate your Gaudi config and your training arguments before importing from `habana_frameworks.torch` or calling a method from `optimum.habana.utils`." 
- ) - else: - autocast_bf16_filename = "/tmp/lower_list.txt" - autocast_fp32_filename = "/tmp/fp32_list.txt" - - self.write_bf16_fp32_ops_to_text_files( - autocast_bf16_filename, - autocast_fp32_filename, - ) - os.environ["LOWER_LIST"] = autocast_bf16_filename - os.environ["FP32_LIST"] = autocast_fp32_filename diff --git a/optimum/habana/transformers/generation/__init__.py b/optimum/habana/transformers/generation/__init__.py deleted file mode 100644 index 6b43ee2ae3..0000000000 --- a/optimum/habana/transformers/generation/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .candidate_generator import GaudiAssistedCandidateGenerator -from .configuration_utils import GaudiGenerationConfig -from .stopping_criteria import ( - gaudi_EosTokenCriteria_call, - gaudi_MaxLengthCriteria_call, - gaudi_MaxNewTokensCriteria_call, - gaudi_MaxTimeCriteria_call, - gaudi_StoppingCriteriaList_call, -) -from .utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES, GaudiGenerationMixin diff --git a/optimum/habana/transformers/generation/candidate_generator.py b/optimum/habana/transformers/generation/candidate_generator.py deleted file mode 100644 index 633e39e9d6..0000000000 --- a/optimum/habana/transformers/generation/candidate_generator.py +++ /dev/null @@ -1,45 +0,0 @@ -import inspect -from typing import TYPE_CHECKING, Dict, Optional - -import torch -from transformers.generation.candidate_generator import ( - AssistedCandidateGenerator, -) - - -if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel - from transfromers.generation.logits_process import LogitsProcessorList - - from .configuration_utils import GaudiGenerationConfig - - -class GaudiAssistedCandidateGenerator(AssistedCandidateGenerator): - def __init__( - self, - input_ids: torch.LongTensor, - assistant_model: "PreTrainedModel", - generation_config: "GaudiGenerationConfig", - logits_processor: "LogitsProcessorList", - model_kwargs: Dict, - inputs_tensor: Optional[torch.Tensor] = None, - ): - super().__init__( - input_ids, - assistant_model, - generation_config, - logits_processor, - model_kwargs, - inputs_tensor, - ) - - # Remove model kwargs that are specific to optimized models - # E.g. token_idx, use_flash_attention, etc... - # Otherwise it will trigger an error in GenerationMixin._validate_model_kwargs - # TODO: may need to complete this for encoder-decoders: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/generation/utils.py#L1133 - model_args = set(inspect.signature(assistant_model.prepare_inputs_for_generation).parameters) - if "kwargs" in model_args or "model_kwargs" in model_args: - model_args |= set(inspect.signature(assistant_model.forward).parameters) - for key, value in list(self.assistant_kwargs.items()): - if value is not None and key not in model_args: - del self.assistant_kwargs[key] diff --git a/optimum/habana/transformers/generation/configuration_utils.py b/optimum/habana/transformers/generation/configuration_utils.py deleted file mode 100644 index ce38a07ed9..0000000000 --- a/optimum/habana/transformers/generation/configuration_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -from transformers.generation import GenerationConfig - - -class GaudiGenerationConfig(GenerationConfig): - """ - This class extends [`transformers.generation.GenerationConfig`](https://github.com/huggingface/transformers/blob/main/src/transformers/generation/configuration_utils.py) - to add HPU-specific arguments for generation. 
-
-    Args:
-        trim_logits (`bool`, *optional*):
-            Calculate logits only for the last token to save memory in the first step.
-        static_shapes (`bool`, *optional*):
-            Whether to use static shapes for generation or not. It will run faster on HPUs with static shapes
-            but not all models support it. If not specified, it will automatically be set to `True` if the given
-            model supports it.
-        ignore_eos (`bool`, *optional*):
-            Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
-            If not specified, it will automatically be set to `True` if lazy mode is on.
-        attn_softmax_bf16 (`bool`, *optional*):
-            Whether to run attention softmax layer in lower precision provided that the model supports it and
-            is also running in lower precision.
-        limit_hpu_graphs (`bool`, *optional*):
-            Skip HPU Graph usage for the first token to save memory.
-        reuse_cache (`bool`, *optional*):
-            Whether to reuse key/value cache for decoding. It should save memory.
-        bucket_size (`int`, *optional*):
-            If negative (default=-1), pad to max if `static_shapes` is set. Otherwise, start with
-            `shape = bucket_size * ceil(prompt_len/bucket_size)` and then grow space by `bucket_size` when needed.
-            Only active if `static_shapes` is used. Can't be used with `reuse_cache`.
-        bucket_internal (`bool`, *optional*):
-            Split the KV sequence into buckets in the decode phase. It improves throughput when max_new_tokens is large.
-        use_flash_attention (`bool`, *optional*):
-            Whether to use the flash attention optimization.
-        flash_attention_recompute (`bool`, *optional*):
-            Whether to enable recompute when using Habana flash attention.
-        flash_attention_causal_mask (`bool`, *optional*):
-            Whether to enable causal_mask when using Habana flash attention.
-        flash_attention_fast_softmax (`bool`, *optional*):
-            Whether to use fast softmax with reduced precision when using Habana flash attention.
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.trim_logits = kwargs.get("trim_logits", None)
-        self.static_shapes = kwargs.get("static_shapes", None)
-        self.ignore_eos = kwargs.get("ignore_eos", None)
-        self.attn_softmax_bf16 = kwargs.get("attn_softmax_bf16", None)
-        self.limit_hpu_graphs = kwargs.get("limit_hpu_graphs", None)
-        self.reuse_cache = kwargs.get("reuse_cache", None)
-        self.bucket_size = kwargs.get("bucket_size", -1)
-        self.bucket_internal = kwargs.get("bucket_internal", None)
-        self.reduce_recompile = kwargs.get("reduce_recompile", None)
-        self.use_flash_attention = kwargs.get("use_flash_attention", None)
-        self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None)
-        self.flash_attention_causal_mask = kwargs.get("flash_attention_causal_mask", None)
-        self.flash_attention_fast_softmax = kwargs.get("flash_attention_fast_softmax", None)
-        self.use_fused_rope = kwargs.get("use_fused_rope", None)
diff --git a/optimum/habana/transformers/generation/stopping_criteria.py b/optimum/habana/transformers/generation/stopping_criteria.py
deleted file mode 100644
index dac7aadd92..0000000000
--- a/optimum/habana/transformers/generation/stopping_criteria.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -from typing import Union - -import torch - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - -# Instead of returning a tensor describing status of completeness of each sentence -# we only return a single boolean describing the state of the batch -# only when needs_tensor_output says so, we return array of booleans - - -def create_return_const_tensor(input_ids, is_done): - return torch.full((input_ids.shape[0],), 1 if is_done else 0, device=input_ids.device, dtype=torch.uint8) - - -def gaudi_MaxLengthCriteria_call( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs -) -> Union[torch.BoolTensor, bool]: - token_idx = kwargs.get("token_idx", None) - if token_idx is not None: - assert not kwargs["needs_tensor_output"] - return token_idx >= self.max_length - else: - cur_len = input_ids.shape[-1] - is_done = cur_len >= self.max_length - if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings: - logger.warning_once( - "This is a friendly reminder - the current text generation call will exceed the model's predefined " - f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe " - "exceptions, performance degradation, or nothing at all." - ) - return create_return_const_tensor(input_ids, is_done) - - -def gaudi_MaxNewTokensCriteria_call( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs -) -> Union[torch.BoolTensor, bool]: - token_idx = kwargs.get("token_idx", None) - if token_idx is not None: - assert not kwargs["needs_tensor_output"] - return token_idx >= self.max_length - else: - is_done = input_ids.shape[-1] >= self.max_length - return create_return_const_tensor(input_ids, is_done) - - -def gaudi_MaxTimeCriteria_call( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs -) -> Union[torch.BoolTensor, bool]: - is_done = time.time() - self.initial_timestamp > self.max_time - if kwargs["needs_tensor_output"]: - return create_return_const_tensor(input_ids, is_done) - else: - return is_done - - -def gaudi_EosTokenCriteria_call( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs -) -> Union[torch.BoolTensor, bool]: - self.eos_token_id = self.eos_token_id.to(input_ids.device) - token_idx = kwargs.get("token_idx", None) - if token_idx is not None: - assert not kwargs["needs_tensor_output"] - is_done = torch.isin(input_ids[:, token_idx - 1], self.eos_token_id) - else: - is_done = torch.isin(input_ids[:, -1], self.eos_token_id) - if kwargs["needs_tensor_output"]: - return is_done.byte() - else: - return torch.all(is_done).item() - - -def needs_tensor_output(token_idx, ignore_eos, eos_token_id) -> bool: - if token_idx is None: - return not ignore_eos and eos_token_id is not None - else: - # token_idx is present, so we have static shapes, so using single boolean - return False - - -def gaudi_StoppingCriteriaList_call( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs -) -> Union[torch.BoolTensor, bool]: - kwargs["needs_tensor_output"] = needs_tensor_output( - 
kwargs.get("token_idx", None), kwargs.get("ignore_eos", True), kwargs.get("eos_token_id", None) - ) - is_done = ( - torch.full((input_ids.shape[0],), 0, device=input_ids.device, dtype=torch.int8) - if kwargs["needs_tensor_output"] - else False - ) - for criteria in self: - is_done = is_done | criteria(input_ids, scores, **kwargs) - return is_done diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py deleted file mode 100755 index a35d93b5e5..0000000000 --- a/optimum/habana/transformers/generation/utils.py +++ /dev/null @@ -1,4161 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import inspect -import math -import time -import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - -import torch -import torch.distributed as dist -from transformers.generation.beam_constraints import DisjunctiveConstraint, PhrasalConstraint -from transformers.generation.beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer -from transformers.generation.candidate_generator import ( - CandidateGenerator, - PromptLookupCandidateGenerator, - _crop_past_key_values, - _prepare_attention_mask, - _prepare_token_type_ids, -) -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - MaxLengthCriteria, - MaxTimeCriteria, - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation.utils import ( - NEED_SETUP_CACHE_CLASSES_MAPPING, - GenerateBeamDecoderOnlyOutput, - GenerateBeamEncoderDecoderOutput, - GenerateBeamOutput, - GenerateDecoderOnlyOutput, - GenerateEncoderDecoderOutput, - GenerateNonBeamOutput, - GenerateOutput, - GenerationMixin, - GenerationMode, - _split_model_inputs, - _split_model_outputs, - stack_model_outputs, -) -from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.utils import ModelOutput, is_torchdynamo_compiling - -from optimum.utils import logging - -from ...utils import HabanaGenerationtime, HabanaProfile -from ..integrations.deepspeed import unwrap_deepspeed_model -from .candidate_generator import GaudiAssistedCandidateGenerator -from .configuration_utils import GaudiGenerationConfig - - -if TYPE_CHECKING: - from transformers import PreTrainedModel - from transformers.streamers import BaseStreamer - - from .candidate_generator import GaudiCandidateGenerator - - -MODELS_OPTIMIZED_WITH_STATIC_SHAPES = [ - "bloom", - "gpt2", - "opt", - "gptj", - "gpt_neox", - "llama", - "falcon", - "codegen", - "gpt_bigcode", - "bart", - "mpt", - "t5", - "mistral", - "phi", - "mixtral", - "gemma", - "blip_text_model", - "seamless_m4t", - "starcoder2", - "persimmon", - "qwen2", - "llava", - "llava_next", - "stablelm", -] - - -logger 
= logging.get_logger(__name__) - - -def incrementor(bucket_size, prompt_len): - assert bucket_size > 0 - passnum = -1 - while True: - passnum += 1 - if passnum == 0: - token_idx = prompt_len - allocated_space = int(math.ceil(prompt_len / bucket_size) * bucket_size) - if prompt_len % bucket_size == 0: - allocated_space += bucket_size - need_expansion = True - else: - token_idx += 1 - need_expansion = token_idx >= allocated_space - if need_expansion: - assert (allocated_space - token_idx) <= bucket_size - allocated_space += bucket_size - yield { - "allocated_space": allocated_space, - "passnum": passnum, - "token_idx": token_idx, - "need_expansion": need_expansion, - } - - -def get_final_stopping_criteria(x): - if isinstance(x, bool): - return x - elif torch.is_tensor(x): - return all(x) - else: - raise TypeError(f"The stopping criteria should be either a boolean or a torch.tensor but got {type(x)}.") - - -class GaudiGenerationMixin(GenerationMixin): - """ - This class enables to perform fast generation in lazy mode and with HPU graphs. - The only difference with GenerationMixin is that the various generation - methods will generate sequences whose size is max_length. Having constant - sizes allows to make the most of lazy mode and HPU graphs. - """ - - def _get_hpu_graphs_kwargs(self, model_kwargs): - hpu_graphs_kwargs = {} - if model_kwargs["limit_hpu_graphs"]: - hpu_graphs_kwargs.update({"bypass_hpu_graphs": False}) - if "first_token" not in model_kwargs.keys(): - model_kwargs["first_token"] = True - hpu_graphs_kwargs.update({"bypass_hpu_graphs": True}) - return hpu_graphs_kwargs - - def _prepare_decoder_attention_mask( - self, - max_steps: int, # current stopping criteria - batch_size: int, - pad_token_id: int, - device: str, - dtype: str = bool, - ) -> torch.Tensor: - x = torch.zeros((batch_size, max_steps), device=device, dtype=dtype) - return x.index_fill(1, torch.tensor(0), 1) # First the position with pad_token_id - - def _prepare_decoder_input_ids_for_generation( - self, - batch_size: int, - model_input_name: str, - model_kwargs: Dict[str, torch.Tensor], - decoder_start_token_id: Union[int, List[int]] = None, - bos_token_id: int = None, - device: torch.device = None, - max_new_tokens: int = None, - pad_token_id: int = None, - ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]: - """Prepares `decoder_input_ids` for generation with encoder-decoder models""" - # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming, - # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input. - - if model_kwargs is not None and "decoder_input_ids" in model_kwargs: - decoder_input_ids = model_kwargs.pop("decoder_input_ids") - elif "input_ids" in model_kwargs and model_input_name != "input_ids": - decoder_input_ids = model_kwargs.pop("input_ids") - else: - decoder_input_ids = None - - token_idx = model_kwargs.get("token_idx", None) - - # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that. 
- decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) - if device is None: - device = self.device - if token_idx is None: - if isinstance(decoder_start_token_id, list): - if len(decoder_start_token_id) != batch_size: - raise ValueError( - f"`decoder_start_token_id` expcted to have length {batch_size} but got {len(decoder_start_token_id)}" - ) - decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) - decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) - else: - decoder_input_ids_start = ( - torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id - ) - else: - # creating padded decoder_input_ids to achieve static shapes. Later new tokens once generated are copied in to decoder_input_ids based on token_idx - max_length = max_new_tokens + 1 if max_new_tokens is not None else self.generation_config.max_length - decoder_input_ids_start = ( - torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id - ) - decoder_input_ids_start = torch.nn.functional.pad( - decoder_input_ids_start, (0, max_length - 1), value=pad_token_id - ) - - # no user input -> use decoder_start_token_id as decoder_input_ids - if decoder_input_ids is None: - decoder_input_ids = decoder_input_ids_start - # exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token - elif self.config.model_type == "vision-encoder-decoder" and "donut" in self.name_or_path.lower(): - pass - elif self.config.model_type in ["whisper"]: - pass - # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust - # decoder_attention_mask if provided) - elif ( - isinstance(decoder_start_token_id, int) - and (decoder_input_ids[:, 0] != decoder_start_token_id).all().item() - ) or ( - isinstance(decoder_start_token_id, torch.Tensor) - and (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item() - ): - if token_idx is None: - decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1) - else: - max_length = max_new_tokens + 2 if max_new_tokens is not None else self.generation_config.max_length - if max_length != decoder_input_ids_start.shape[-1]: - decoder_input_ids_start = torch.nn.functional.pad( - decoder_input_ids_start, - (0, max_length - decoder_input_ids_start.shape[-1]), - value=pad_token_id, - ) - decoder_input_ids = decoder_input_ids_start.index_copy(1, token_idx, decoder_input_ids) - token_idx.add_(1) - if "decoder_attention_mask" in model_kwargs: - decoder_attention_mask = model_kwargs["decoder_attention_mask"] - decoder_attention_mask = torch.cat( - (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask), - dim=-1, - ) - model_kwargs["decoder_attention_mask"] = decoder_attention_mask - else: - if token_idx is not None: - decoder_input_ids_len = decoder_input_ids.shape[-1] - max_length = ( - max_new_tokens + decoder_input_ids_len - if max_new_tokens is not None - else self.generation_config.max_length - ) - decoder_input_ids = torch.nn.functional.pad( - decoder_input_ids, (0, max_length - decoder_input_ids_len), value=pad_token_id - ) - token_idx.copy_(decoder_input_ids_len) - if "decoder_attention_mask" in model_kwargs: - decoder_attention_mask = model_kwargs["decoder_attention_mask"] - pad_len = max_length - decoder_attention_mask.shape[-1] - decoder_attention_mask = torch.cat( - (torch.ones_like(decoder_attention_mask)[:, :pad_len], decoder_attention_mask), - 
dim=-1, - ) - model_kwargs["decoder_attention_mask"] = decoder_attention_mask - - return decoder_input_ids, model_kwargs - - @staticmethod - def _expand_inputs_for_generation( - expand_size: int = 1, - is_encoder_decoder: bool = False, - input_ids: Optional[torch.LongTensor] = None, - **model_kwargs, - ) -> Tuple[torch.LongTensor, Dict[str, Any]]: - """ - Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]. - - Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L704 - The tensor `token_idx` is not expanded. - """ - - def _expand_dict_for_generation(dict_to_expand): - for key in dict_to_expand: - if ( - key != "token_idx" - and key != "decoder_input_ids" - and key != "cache_position" - and dict_to_expand[key] is not None - and isinstance(dict_to_expand[key], torch.Tensor) - ): - dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0) - return dict_to_expand - - if input_ids is not None: - input_ids = input_ids.repeat_interleave(expand_size, dim=0) - - model_kwargs = _expand_dict_for_generation(model_kwargs) - - if is_encoder_decoder: - if model_kwargs.get("encoder_outputs") is None: - raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") - model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"]) - - return input_ids, model_kwargs - - def _pad_past_key_values(self, model_kwargs): - pad_amount = model_kwargs.get("kv_cache_pad_len", 0) - if model_kwargs["past_key_values"]: - for i in range(len(model_kwargs["past_key_values"])): - for j in range(len(model_kwargs["past_key_values"][i])): - if torch.is_tensor(model_kwargs["past_key_values"][i][j]): - model_kwargs["past_key_values"][i][j] = torch.nn.functional.pad( - model_kwargs["past_key_values"][i][j], (0, 0, 0, pad_amount) - ) - if model_kwargs.get("lazy_mode", False): - self.htcore_generation.mark_step() - - def _remove_past_key_values(self, model_kwargs): - if model_kwargs["past_key_values"]: - for i in range(len(model_kwargs["past_key_values"])): - for j in range(len(model_kwargs["past_key_values"][i])): - if torch.is_tensor(model_kwargs["past_key_values"][i][j]): - t = model_kwargs["past_key_values"][i][j] - del t - model_kwargs["past_key_values"][i][j] = None - del model_kwargs["past_key_values"] - model_kwargs["past_key_values"] = None - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L745 - - Adds support for `token_idx`, which is necessary for using static shapes. 
- """ - # mark to identify starting from second token - model_kwargs["first_token"] = False - if not model_kwargs.get("pad_done", False): - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - if getattr(outputs, "state", None) is not None: - model_kwargs["state"] = outputs.state - - # update token_type_ids with last value - if "token_type_ids" in model_kwargs: - token_type_ids = model_kwargs["token_type_ids"] - model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) - - token_idx = model_kwargs.get("token_idx", None) - - if not is_encoder_decoder: - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - if token_idx is not None: - attention_mask.index_fill_(1, token_idx, 1) - else: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - model_kwargs["attention_mask"] = attention_mask - else: - # update decoder attention mask - if "decoder_attention_mask" in model_kwargs: - decoder_attention_mask = model_kwargs["decoder_attention_mask"] - if token_idx is not None: - decoder_attention_mask.index_fill_(1, token_idx, 1) - else: - decoder_attention_mask = torch.cat( - [ - decoder_attention_mask, - decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1)), - ], - dim=-1, - ) - model_kwargs["decoder_attention_mask"] = decoder_attention_mask - - if token_idx is not None: - token_idx.add_(1) - if "token_idx_cpu" in model_kwargs: - model_kwargs["token_idx_cpu"] += 1 - - if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None: - model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1 - - return model_kwargs - - @torch.no_grad() - def update_model_kwargs_for_bucketing( - self, params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile=False - ): - if params["need_expansion"]: - # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs - pad_amount = params["allocated_space"] - input_ids.shape[-1] - input_ids = torch.nn.functional.pad(input_ids, (0, pad_amount), value=pad_token_id) - if model_kwargs["attention_mask"] is not None: - model_kwargs["attention_mask"] = torch.nn.functional.pad( - model_kwargs["attention_mask"], (0, pad_amount), value=0 - ) - else: - assert False, "Not tested for cases where attn_mask isnt passed" - if reduce_recompile and params["passnum"] == 0: - position_ids_cpu = model_kwargs["attention_mask"].long().cumsum(-1) - 1 - position_ids_cpu.masked_fill_(model_kwargs["attention_mask"] == 0, 1) - input_ids = input_ids.to(self.device) - model_kwargs["attention_mask"] = model_kwargs["attention_mask"].to(self.device) - - if "past_key_values" in model_kwargs: - - def create_pad_arg(pad_amount, i, j): - if model_kwargs["past_key_values"][0][0].dim() == 3: - assert self.config.model_type == "bloom" - if j == 0: - return (0, pad_amount) - elif j == 1: - return (0, 0, 0, pad_amount) - else: - assert False - elif model_kwargs["past_key_values"][0][0].dim() == 4: - return (0, 0, 0, pad_amount) # llama, falcon, qwen2 - else: - assert False, "Unknown case, please handle, or dont use bucketing" - - new_kv = [None for i in range(len(model_kwargs["past_key_values"]))] - if self.config.model_type == "gpt_bigcode" and model_kwargs["past_key_values"][0][0].dim() == 2: - # GPT_BIGCODE's kv 
cache is list of tensors. - new_kv = [None for i in range(len(model_kwargs["past_key_values"]))] - for i in range(len(model_kwargs["past_key_values"])): - pad = (0, 0, 0, pad_amount) - new_kv[i] = torch.nn.functional.pad( - model_kwargs["past_key_values"][i], pad, value=pad_token_id - ) - model_kwargs["past_key_values"] = list(new_kv) - else: - for i in range(len(model_kwargs["past_key_values"])): - tmp_lst = [None for j in range(len(model_kwargs["past_key_values"][i]))] - for j in range(len(model_kwargs["past_key_values"][i])): - pad_tuple = create_pad_arg(pad_amount, i, j) - # Different models might have different shapes of kv-cache - # create_pad_arg handles them on a per-model basis - # This is a necessary (but not sufficient) condition: what ever dimension we are padding, should be a multiple of bucket_size - # This check is added in case we get a new model with a new kv-cache structure, and we attempt to pad some wrong dimension - # in peft case, if there's virtual token. the model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size == num_virtual_token, no need of assert, the pad length of past_key_value should be aligned with input id and attention_mask - num_virtual_tokens = model_kwargs.get("num_virtual_tokens", 0) - assert ( - model_kwargs["past_key_values"][i][j].shape[-(len(pad_tuple) // 2)] % bucket_size - == num_virtual_tokens - ) - tmp_lst[j] = torch.nn.functional.pad( - model_kwargs["past_key_values"][i][j], pad_tuple, value=pad_token_id - ) - new_kv[i] = tuple(tmp_lst) - model_kwargs["past_key_values"] = tuple(new_kv) - - if "token_idx" not in model_kwargs: - model_kwargs["token_idx"] = torch.tensor(params["token_idx"], device=self.device) - return input_ids, model_kwargs - - def _get_candidate_generator( - self, - generation_config: GaudiGenerationConfig, - input_ids: torch.LongTensor, - inputs_tensor: torch.Tensor, - assistant_model: "PreTrainedModel", - logits_processor: LogitsProcessorList, - model_kwargs: Dict, - ) -> CandidateGenerator: - if generation_config.prompt_lookup_num_tokens is not None: - candidate_generator = PromptLookupCandidateGenerator( - num_output_tokens=generation_config.prompt_lookup_num_tokens, - max_matching_ngram_size=generation_config.max_matching_ngram_size, - max_length=generation_config.max_length, - ) - else: - candidate_generator = GaudiAssistedCandidateGenerator( - input_ids=input_ids, - assistant_model=assistant_model, - generation_config=generation_config, - logits_processor=logits_processor, - model_kwargs=model_kwargs, - inputs_tensor=inputs_tensor, - ) - return candidate_generator - - def _get_stopping_criteria( - self, - generation_config: GaudiGenerationConfig, - stopping_criteria: Optional[StoppingCriteriaList], - ignore_eos: bool = False, - ) -> StoppingCriteriaList: - criteria = StoppingCriteriaList() - if generation_config.max_length is not None: - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - criteria.append( - MaxLengthCriteria( - max_length=generation_config.max_length, - max_position_embeddings=max_position_embeddings, - ) - ) - if generation_config.max_time is not None: - criteria.append(MaxTimeCriteria(max_time=generation_config.max_time)) - if not ignore_eos and generation_config.eos_token_id is not None: - criteria.append(EosTokenCriteria(eos_token_id=generation_config.eos_token_id)) - criteria = self._merge_criteria_processor_list(criteria, stopping_criteria) - return criteria - - def _prepare_generated_length( - self, - generation_config, - 
has_default_max_length,
-        has_default_min_length,
-        model_input_name,
-        input_ids_length,
-        inputs_tensor,
-        has_token_idx,
-    ):
-        """Prepares max and min length in generation configs to avoid clashes between similar attributes."""
-
-        if generation_config.max_new_tokens is not None:
-            if not has_default_max_length and generation_config.max_length is not None:
-                logger.warning(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
-                )
-            if has_token_idx:
-                generation_config.max_length = input_ids_length
-            else:
-                generation_config.max_length = generation_config.max_new_tokens + input_ids_length
-
-        # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length
-        # otherwise we need the total length [inputs-embeds-len + new-tokens-len] to not go beyond the indicated `max_length`
-        elif (
-            model_input_name == "inputs_embeds"
-            and input_ids_length != inputs_tensor.shape[1]
-            and not self.config.is_encoder_decoder
-        ):
-            generation_config.max_length -= inputs_tensor.shape[1]
-
-        # same for min length
-        if generation_config.min_new_tokens is not None:
-            if not has_default_min_length:
-                logger.warning(
-                    f"Both `min_new_tokens` (={generation_config.min_new_tokens}) and `min_length`(="
-                    f"{generation_config.min_length}) seem to have been set. `min_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
-                )
-            if has_token_idx:
-                generation_config.min_length = input_ids_length
-            else:
-                generation_config.min_length = generation_config.min_new_tokens + input_ids_length
-
-        elif (
-            model_input_name == "inputs_embeds"
-            and input_ids_length != inputs_tensor.shape[1]
-            and not self.config.is_encoder_decoder
-        ):
-            generation_config.min_length = max(generation_config.min_length - inputs_tensor.shape[1], 0)
-
-        return generation_config
-
-    def _prepare_generation_config(
-        self, generation_config: GaudiGenerationConfig, **kwargs: Dict
-    ) -> Tuple[GaudiGenerationConfig, Dict]:
-        """
-        Copied from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/generation/utils.py#L1230
-        Differences:
-        - add management of `static_shapes` and `ignore_eos` in the generation config
-        - workaround for `token_type_ids` for Falcon
-        """
-        # TODO joao: when we can detect `fullgraph=True` in `torch.compile` (https://github.com/pytorch/pytorch/pull/120400)
-        # replace `is_torchdynamo_compiling` by the corresponding check. As it is, we are being too restrictive with
-        # the parameterization in `fullgraph=False` so as to enable `fullgraph=True`.
-
-        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
-        if generation_config is None:
-            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
-            # three conditions must be met
-            # 1) the generation config must have been created from the model config (`_from_model_config` field);
-            # 2) the generation config must have seen no modification since its creation (the hash is the same);
-            # 3) the user must have set generation parameters in the model config.
-            # NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
- if ( - not is_torchdynamo_compiling() - and self.generation_config._from_model_config - and self.generation_config._original_object_hash == hash(self.generation_config) - and self.config._has_non_default_generation_parameters() - ): - new_generation_config = GaudiGenerationConfig.from_model_config(self.config) - if new_generation_config != self.generation_config: - warnings.warn( - "You have modified the pretrained model configuration to control generation. This is a" - " deprecated strategy to control generation and will be removed soon, in a future version." - " Please use and modify the model generation configuration (see" - " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )" - ) - self.generation_config = new_generation_config - generation_config = self.generation_config - - # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config` - # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled. - if is_torchdynamo_compiling(): - model_kwargs = kwargs - generate_attributes_in_kwargs = [ - key for key, value in kwargs.items() if getattr(generation_config, key, None) != value - ] - if len(generate_attributes_in_kwargs) > 0: - raise ValueError( - "`torch.compile` exception: all generation configuration attributes must be passed within a " - f"`generation_config` instance passed to `generate` (found: {generate_attributes_in_kwargs})." - ) - else: - generation_config = copy.deepcopy(generation_config) - if generation_config.static_shapes is None: - generation_config.static_shapes = self.config.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES - if self.config.model_type == "vision-encoder-decoder": - generation_config.static_shapes = ( - self.config.decoder.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES - ) - self.generation_config.static_shapes = generation_config.static_shapes - if generation_config.ignore_eos is None: - generation_config.ignore_eos = kwargs.get("ignore_eos", kwargs.get("lazy_mode", None)) - self.generation_config.ignore_eos = generation_config.ignore_eos - model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs - if self.config.model_type == "falcon" and "token_type_ids" in kwargs.keys(): - for key in ["token_type_ids"]: - model_kwargs.pop(key, None) - - return generation_config, model_kwargs - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GaudiGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, - negative_prompt_ids: Optional[torch.Tensor] = None, - negative_prompt_attention_mask: Optional[torch.Tensor] = None, - lazy_mode: Optional[bool] = False, - hpu_graphs: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - iteration_times: Optional[List[float]] = None, - profiling_record_shapes: Optional[bool] = False, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head. 
- - - - Most generation-controlling parameters are set in [`transformers.generation.generation_config`] which, if not passed, will be set to the - model's default generation configuration. You can override any `generation_config` by passing the corresponding - parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`. - - For an overview of generation strategies and code examples, check out the [following - guide](../generation_strategies). - - - - Most of these parameters are explained in more detail in [this blog - post](https://huggingface.co/blog/how-to-generate). - Parameters: - inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): - The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the - method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` - should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of - `input_ids`, `input_values`, `input_features`, or `pixel_values`. - generation_config (`transformers.generation.GenerationConfig`, *optional*): - The generation configuration to be used as base parametrization for the generation call. `**kwargs` - passed to generate matching the attributes of `generation_config` will override them. If - `generation_config` is not provided, the default will be used, which has the following loading - priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model - configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s - default values, whose documentation should be checked to parameterize generation. - logits_processor (`LogitsProcessorList`, *optional*): - Custom logits processors that complement the default logits processors built from arguments and - generation config. If a logit processor is passed that is already created with the arguments or a - generation config an error is thrown. This feature is intended for advanced users. - stopping_criteria (`StoppingCriteriaList`, *optional*): - Custom stopping criteria that complements the default stopping criteria built from arguments and a - generation config. If a stopping criteria is passed that is already created with the arguments or a - generation config an error is thrown. If your stopping criteria depends on the `scores` input, make - sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is - intended for advanced users. - prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): - If provided, this function constraints the beam search to allowed tokens only at each step. If not - provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and - `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned - on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful - for constrained generation conditioned on the prefix, as described in [Autoregressive Entity - Retrieval](https://arxiv.org/abs/2010.00904). - synced_gpus (`bool`, *optional*): - Whether to continue running the while loop until max_length. Unless overridden this flag will be set to - `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished - generating before other GPUs. Otherwise it'll be set to `False`. 
-            assistant_model (`PreTrainedModel`, *optional*):
-                An assistant model that can be used to accelerate generation. The assistant model must have the exact
-                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
-                is much faster than running generation with the model you're calling generate from. As such, the
-                assistant model should be much smaller.
-            streamer (`BaseStreamer`, *optional*):
-                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
-                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
-            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
-                size. This is an experimental feature, subject to breaking API changes in future versions.
-            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Attention mask for `negative_prompt_ids`.
-            lazy_mode (`bool`, *optional*, defaults to `False`):
-                Whether the run is executed in lazy mode or not (i.e. eager mode).
-            hpu_graphs (`bool`, *optional*, defaults to `False`):
-                Whether to use HPU graphs for inference.
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profiling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
-            kwargs (`Dict[str, Any]`, *optional*):
-                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
-                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
-                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
-
-        Return:
-            [`transformers.utils.ModelOutput`] or `torch.LongTensor`: A [`transformers.utils.ModelOutput`] (if `return_dict_in_generate=True`
-            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
-            If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
-            [`transformers.utils.ModelOutput`] types are:
-                - [`transformers.generation.GenerateDecoderOnlyOutput`],
-                - [`transformers.generation.GenerateBeamDecoderOnlyOutput`]
-            If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
-            [`transformers.utils.ModelOutput`] types are:
-                - [`transformers.generation.GenerateEncoderDecoderOutput`],
-                - [`transformers.generation.GenerateBeamEncoderDecoderOutput`]
-        """
-        if iteration_times is not None:
-            hb_gen_time = HabanaGenerationtime(iteration_times=iteration_times)
-            hb_gen_time.start()
-        else:
-            hb_gen_time = None
-        if synced_gpus is None:
-            if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1:
-                synced_gpus = True
-            else:
-                synced_gpus = False
-
-        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
-        self._validate_model_class()
-        if hpu_graphs and not lazy_mode:
-            raise ValueError(
-                "`hpu_graphs` is True but `lazy_mode` is False. HPU graphs require `lazy_mode` to be set to True."
- ) - num_virtual_tokens = kwargs.pop("num_virtual_tokens", 0) - generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs) - self._validate_model_kwargs(model_kwargs.copy()) - - # 2. Set generation parameters if not already defined - if synced_gpus is None: - if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1: - synced_gpus = True - else: - synced_gpus = False - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - if generation_config.pad_token_id is None and generation_config.eos_token_id is not None: - if model_kwargs.get("attention_mask", None) is None: - logger.warning( - "The attention mask and the pad token id were not set. As a consequence, you may observe " - "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results." - ) - eos_token_id = generation_config.eos_token_id - if isinstance(eos_token_id, list): - eos_token_id = eos_token_id[0] - logger.warning( - f"Setting `pad_token_id` to `eos_token_id`:{generation_config.eos_token_id} for open-end generation." - ) - generation_config.pad_token_id = eos_token_id - - # 3. Define model inputs - # inputs_tensor has to be defined - # model_input_name is defined if model-specific keyword input is passed - # otherwise model_input_name is None - # all model-specific keyword inputs are removed from `model_kwargs` - inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( - inputs, generation_config.bos_token_id, model_kwargs - ) - batch_size = inputs_tensor.shape[0] - - # 4. Define other model kwargs - model_kwargs["output_attentions"] = generation_config.output_attentions - model_kwargs["output_hidden_states"] = generation_config.output_hidden_states - # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are - # generating the first new token or not, and we only want to use the embeddings for the first new token) - if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": - model_kwargs["use_cache"] = True - else: - model_kwargs["use_cache"] = generation_config.use_cache - - self.generation_config.max_length = generation_config.max_length - - accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) - requires_attention_mask = "encoder_outputs" not in model_kwargs - - if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: - model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id - ) - - is_greedy_or_beam_and_bucket = ( - not generation_config.bucket_internal - and generation_config.bucket_size > 0 - and ( - generation_config.get_generation_mode(assistant_model) == GenerationMode.GREEDY_SEARCH - or generation_config.get_generation_mode(assistant_model) == GenerationMode.BEAM_SEARCH - ) - ) - model_kwargs["bucket_size"] = generation_config.bucket_size if generation_config.static_shapes else -1 - model_kwargs["bucket_internal"] = generation_config.bucket_internal - model_kwargs["reduce_recompile"] = ( - generation_config.reduce_recompile if generation_config.reduce_recompile is not None else False - ) - if model_kwargs["reduce_recompile"]: - assert generation_config.bucket_size - # Below condition checked explicitly since llama supports 
bucket_internal even without reuse_cache - if generation_config.bucket_internal: - assert generation_config.bucket_size >= 0, "please set bucket_size to use bucket_internal" - if generation_config.reuse_cache: - assert self.config.model_type in [ - "llama", - "mistral", - "falcon", - "mixtral", - "phi", - "qwen2", - ], "reuse_cache only supported by llama, mistral, falcon, mixtral, phi and qwen2 at the moment" - if not generation_config.bucket_internal: - assert ( - generation_config.bucket_size <= 0 - ), "please set bucket_internal along with reuse_cache and bucket_size" - else: - assert generation_config.bucket_size >= 0, "please set valid bucket_size to use bucket_internal" - - if generation_config.static_shapes: - # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs - # In encoder_decoder models, Inputs are already padded - - if not self.config.is_encoder_decoder: - # only pad if bucket_size < -1. If we are bucketing (bucket_size > 0), then that is taken care in greedy_search() - if not is_greedy_or_beam_and_bucket: - # token_idx is the current index in the generation process, it is incremented each time a new token is generated - token_idx = inputs_tensor.shape[-1] - model_kwargs["token_idx"] = torch.tensor(token_idx, device=inputs_tensor.device) - model_kwargs["token_idx_cpu"] = token_idx - if generation_config.max_new_tokens is None: - generation_config.max_new_tokens = generation_config.max_length - token_idx - inputs_tensor = torch.nn.functional.pad( - inputs_tensor, (0, generation_config.max_new_tokens), value=generation_config.pad_token_id - ) - for other_inputs in ["attention_mask", "token_type_ids"]: - if model_kwargs.get(other_inputs) is not None: - model_kwargs[other_inputs] = torch.nn.functional.pad( - model_kwargs[other_inputs], (0, generation_config.max_new_tokens), value=0 - ) - else: - assert generation_config.bucket_size <= 0, "Untested path for bucket>0" - token_idx = 1 - model_kwargs["token_idx"] = torch.tensor(token_idx, device=inputs_tensor.device) - if model_kwargs.get("decoder_attention_mask", None) is None and generation_config.use_cache: - max_length = ( - generation_config.max_new_tokens + 1 - if generation_config.max_new_tokens is not None - else generation_config.max_length - ) - model_kwargs["decoder_attention_mask"] = self._prepare_decoder_attention_mask( - max_length, - inputs_tensor.shape[0], - generation_config.pad_token_id, - inputs_tensor.device, - ) - - # decoder-only models should use left-padding for generation - if not self.config.is_encoder_decoder: - # If `input_ids` was given, check if the last id in any sequence is `pad_token_id` - # Note: If using, `inputs_embeds` this check does not work, because we want to be more hands-off. - if generation_config.pad_token_id is not None: - position = model_kwargs["token_idx"] - 1 if "token_idx" in model_kwargs else -1 - if ( - len(inputs_tensor.shape) == 2 - and torch.sum(inputs_tensor[:, position] == generation_config.pad_token_id) > 0 - ): - logger.warning( - "A decoder-only architecture is being used, but right-padding was detected! For correct " - "generation results, please set `padding_side='left'` when initializing the tokenizer." - ) - - if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: - # if model is encoder decoder encoder_outputs are created - # and added to `model_kwargs` - model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( - inputs_tensor, model_kwargs, model_input_name - ) - - # 5. 
Prepare `input_ids` which will be used for auto-regressive generation - if self.config.is_encoder_decoder: - input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( - batch_size=batch_size, - model_input_name=model_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=generation_config.decoder_start_token_id, - bos_token_id=generation_config.bos_token_id, - device=inputs_tensor.device, - max_new_tokens=generation_config.max_new_tokens, - pad_token_id=generation_config.pad_token_id, - ) - else: - input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") - - if streamer is not None: - streamer.put(input_ids.cpu()) - - # 6. Prepare `max_length` depending on other stopping criteria. - input_ids_length = input_ids.shape[-1] - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None - generation_config = self._prepare_generated_length( - generation_config=generation_config, - has_default_max_length=has_default_max_length, - has_default_min_length=has_default_min_length, - model_input_name=model_input_name, - inputs_tensor=inputs_tensor, - input_ids_length=input_ids_length, - has_token_idx="token_idx" in model_kwargs, - ) - - if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: - if generation_config.cache_implementation == "static": - if model_kwargs.get("past_key_values", False) is not False: - raise ValueError( - "Using `past_key_values` argument with `generate()` when using a static KV cache is not supported. Please open an issue in Transformers GitHub repository." - ) - cache_cls = NEED_SETUP_CACHE_CLASSES_MAPPING["static"] - if not callable(getattr(self, "_setup_cache", None)): - raise ValueError( - "The `generation_config` defines a `cache_implementation` that is not compatible with this model." - " Make sure it has a `_setup_cache` function." 
- ) - self._setup_cache(cache_cls, max_batch_size=batch_size, max_cache_len=generation_config.max_length) - - self._validate_generated_length( - generation_config, - model_kwargs["token_idx"].item() if "token_idx" in model_kwargs else input_ids_length, - has_default_max_length, - ) - - # determine whether introduce trim_logits feature - model_kwargs["trim_logits"] = generation_config.trim_logits - - # determine whether attention softmax needs to execute in lower precision - model_kwargs["attn_softmax_bf16"] = generation_config.attn_softmax_bf16 - - # determine whether limit_hpu_graphs needs to be used - model_kwargs["use_hpu_graphs"] = hpu_graphs - model_kwargs["limit_hpu_graphs"] = generation_config.limit_hpu_graphs - - # prepare for allocate kv cache - model_kwargs["reuse_cache"] = generation_config.reuse_cache - - # determine whether flash attention needs to be used - model_kwargs["use_flash_attention"] = generation_config.use_flash_attention - model_kwargs["flash_attention_recompute"] = True if generation_config.flash_attention_recompute else False - model_kwargs["flash_attention_causal_mask"] = True if generation_config.flash_attention_causal_mask else False - model_kwargs["flash_attention_fast_softmax"] = ( - True if generation_config.flash_attention_fast_softmax else False - ) - model_kwargs["num_virtual_tokens"] = num_virtual_tokens - - if hasattr(generation_config, "valid_sequence_lengths"): - model_kwargs["valid_sequence_lengths"] = generation_config.valid_sequence_lengths - else: - model_kwargs["valid_sequence_lengths"] = None - if not self.config.is_encoder_decoder: - calculated_max_length = input_ids.shape[-1] + num_virtual_tokens - if not generation_config.static_shapes and generation_config.max_new_tokens is not None: - calculated_max_length = input_ids.shape[-1] + generation_config.max_new_tokens + num_virtual_tokens - if generation_config.use_cache and generation_config.reuse_cache: - bs, _ = input_ids.shape - if not is_greedy_or_beam_and_bucket: - unwrap_deepspeed_model(self).allocate_kv_cache( - bs * generation_config.num_beams, calculated_max_length, token_idx + num_virtual_tokens - ) - if generation_config.use_cache: - model_kwargs["kv_cache_len"] = calculated_max_length - model_kwargs["kv_cache_pad_len"] = generation_config.max_new_tokens - - if self.config.model_type in ["llama", "falcon", "mistral", "qwen2"]: - if self.config.max_position_embeddings < calculated_max_length: - unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) - - # 7. determine generation mode - generation_mode = generation_config.get_generation_mode(assistant_model) - - if generation_config.bucket_size > 0: - assert generation_config.static_shapes, "bucket_size > 0 can be set only when static_shapes is set" - # if generation_config.bucket_size <= 0, padding is handled by the generating fn (like greedy_search) - if generation_config.static_shapes and generation_config.bucket_size > 0: - assert ( - generation_mode == GenerationMode.GREEDY_SEARCH - or generation_mode == GenerationMode.SAMPLE - or generation_mode == GenerationMode.BEAM_SEARCH - ), "generation_config.bucket_size > 0 supported only for greedy mode" - - if streamer is not None and (generation_config.num_beams > 1): - raise ValueError( - "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." 
- ) - - if self.device.type != input_ids.device.type: - warnings.warn( - ( - "You are calling .generate() with the `input_ids` being on a device type different" - f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" - f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." - " Please make sure that you have put `input_ids` to the" - f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" - " running `.generate()`." - ), - UserWarning, - ) - - # 8. prepare distribution pre_processing samplers - prepared_logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_length, - encoder_input_ids=inputs_tensor, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - model_kwargs=model_kwargs, - negative_prompt_ids=negative_prompt_ids, - negative_prompt_attention_mask=negative_prompt_attention_mask, - ) - - # 9. prepare stopping criteria - self.generation_config.generation_mode = generation_mode - prepared_stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, - stopping_criteria=stopping_criteria, - ignore_eos=self.generation_config.ignore_eos, - ) - - # In lazy mode, import Habana torch to be able to add mark_step() - if lazy_mode: - import habana_frameworks.torch.core as htcore - - self.htcore_generation = htcore - - # 10. go into different generation modes - if generation_mode == GenerationMode.ASSISTED_GENERATION: - if generation_config.num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1 when doing assisted generate, " - f"but is {generation_config.num_return_sequences}." - ) - if batch_size > 1: - raise ValueError("assisted generate is only supported for batch_size = 1") - if not model_kwargs["use_cache"]: - raise ValueError("assisted generate requires `use_cache=True`") - - # 11. Get the candidate generator, given the parameterization - candidate_generator = self._get_candidate_generator( - generation_config=generation_config, - input_ids=input_ids, - inputs_tensor=inputs_tensor, - assistant_model=assistant_model, - logits_processor=logits_processor, - model_kwargs=model_kwargs, - ) - - # 12. run assisted generate - result = self._assisted_decoding( - input_ids, - candidate_generator=candidate_generator, - do_sample=generation_config.do_sample, - logits_processor=prepared_logits_processor, - logits_warper=self._get_logits_warper(generation_config) if generation_config.do_sample else None, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - streamer=streamer, - lazy_mode=lazy_mode, - ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - **model_kwargs, - ) - if generation_mode == GenerationMode.GREEDY_SEARCH: - # 11. 
run greedy search - result = self._greedy_search( - input_ids, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - streamer=streamer, - lazy_mode=lazy_mode, - ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: - if not model_kwargs["use_cache"]: - raise ValueError("Contrastive search requires `use_cache=True`") - - result = self._contrastive_search( - input_ids, - top_k=generation_config.top_k, - penalty_alpha=generation_config.penalty_alpha, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - streamer=streamer, - sequential=generation_config.low_memory, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.SAMPLE: - # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config) - - # 12. expand input_ids with `num_return_sequences` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=generation_config.num_return_sequences, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - - # 13. run sample - result = self._sample( - input_ids, - logits_processor=prepared_logits_processor, - logits_warper=logits_warper, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - streamer=streamer, - lazy_mode=lazy_mode, - ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.BEAM_SEARCH: - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - max_length=generation_config.max_length, - ) - # 12. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=generation_config.num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 13. 
run beam search - result = self._beam_search( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - sequential=generation_config.low_memory, - lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.BEAM_SAMPLE: - # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config) - - # 12. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - max_length=generation_config.max_length, - ) - - # 13. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=generation_config.num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - - # 14. run beam sample - result = self._beam_sample( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - logits_warper=logits_warper, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - num_beam_groups=generation_config.num_beam_groups, - max_length=generation_config.max_length, - ) - # 12. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=generation_config.num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 13. 
run beam search - result = self._group_beam_search( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: - final_constraints = [] - if generation_config.constraints is not None: - final_constraints = generation_config.constraints - - if generation_config.force_words_ids is not None: - - def typeerror(): - raise ValueError( - "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` " - f"of positive integers, but is {generation_config.force_words_ids}." - ) - - if ( - not isinstance(generation_config.force_words_ids, list) - or len(generation_config.force_words_ids) == 0 - ): - typeerror() - - for word_ids in generation_config.force_words_ids: - if isinstance(word_ids[0], list): - if not isinstance(word_ids, list) or len(word_ids) == 0: - typeerror() - if any(not isinstance(token_ids, list) for token_ids in word_ids): - typeerror() - if any( - any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids) - for token_ids in word_ids - ): - typeerror() - - constraint = DisjunctiveConstraint(word_ids) - else: - if not isinstance(word_ids, list) or len(word_ids) == 0: - typeerror() - if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids): - typeerror() - - constraint = PhrasalConstraint(word_ids) - final_constraints.append(constraint) - - # 11. prepare beam search scorer - constrained_beam_scorer = ConstrainedBeamSearchScorer( - constraints=final_constraints, - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - max_length=generation_config.max_length, - ) - # 12. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=generation_config.num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 13. 
run beam search - result = self._constrained_beam_search( - input_ids, - constrained_beam_scorer=constrained_beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, - pad_token_id=generation_config.pad_token_id, - output_scores=generation_config.output_scores, - output_logits=generation_config.output_logits, - return_dict_in_generate=generation_config.return_dict_in_generate, - synced_gpus=synced_gpus, - lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, - hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, - **model_kwargs, - ) - - if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: - if not callable(getattr(self, "_reset_cache", None)): - raise ValueError( - "A `static_cache` was used to generate but there was a failure when trying to release the cache. " - " Make sure this model implements a `_reset_cache` function." - ) - self._reset_cache() - - return result - - @torch.no_grad() - def _contrastive_search( - self, - input_ids: torch.LongTensor, - top_k: Optional[int] = 1, - penalty_alpha: Optional[float] = 0, - logits_processor: Optional[LogitsProcessorList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - sequential: Optional[bool] = None, - lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **contrastive search** and can - be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._contrastive_search`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - top_k (`int`, *optional*, defaults to 1): - The size of the candidate set that is used to re-rank for contrastive search - penalty_alpha (`float`, *optional*, defaults to 0): - The degeneration penalty for contrastive search; activate when it is larger than 0 - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. 
- stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors - for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], - [`transformers.generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` - containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") - >>> model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") - >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token - >>> model.config.pad_token_id = model.config.eos_token_id - >>> input_prompt = "DeepMind Company is" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt") - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=64)]) - >>> outputs = model._contrastive_search( - ... 
**input_ids, penalty_alpha=0.6, top_k=4, stopping_criteria=stopping_criteria - ... ) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it'] - ```""" - - raise NotImplementedError("Contrastive search is not supported by optimum-habana yet.") - - def _greedy_search( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - lazy_mode: Optional[bool] = False, - ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._greedy_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. 
See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors - for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - ignore_eos (`bool`, *optional*, defaults to `False`): - Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] - or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> outputs = model._greedy_search( - ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." - ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if not self.generation_config.ignore_eos: - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() - for criteria in stopping_criteria - if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - raw_logits = () if (return_dict_in_generate and output_logits) else None - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - batch_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - this_peer_finished = False - if not ignore_eos: - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, 
device=input_ids.device) - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() - bucket_size = model_kwargs.get("bucket_size", -1) - prev_idx = -1 # avoiding calculate cache_idx when its value is not changing - bucket_internal = model_kwargs.get("bucket_internal", None) - reduce_recompile = model_kwargs.get("reduce_recompile", False) - - if not bucket_internal: - if bucket_size >= 0: - inc = iter(incrementor(bucket_size, cur_len)) - if bucket_size > 0: - assert "position_ids" not in model_kwargs, "Untested path" - greedy_first = True - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None: - # Update cur_len in case of static shapes - cur_len = token_idx.item() - - time_to_first_token_done = False - model_kwargs["pad_done"] = False - model_kwargs["lazy_mode"] = lazy_mode - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - if lazy_mode: - self.htcore_generation.mark_step() - - if bucket_size > 0 and not bucket_internal: - # it will not have been padded if bucket_size > 0 - params = next(inc) - input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( - params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile - ) - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None and outputs.logits.shape[-2] > 1: - # case1 (w/o KV caching): outputs.logits.shape: [batch_size, max_length, vocab_size] - if self.config.is_encoder_decoder: - next_token_logits = outputs.logits[:, token_idx - 1, :] - next_tokens_scores = logits_processor(input_ids[:, :token_idx], next_token_logits) - else: - if model_kwargs.get("num_virtual_tokens", 0) > 0: - # for prompt tuning, the output logit shape > model_inputs["input_ids"].shape[-1] - if model_kwargs.get("reuse_cache", False): - output_idx = torch.tensor(outputs.logits.shape[-2], device=input_ids.device) - else: - output_idx = token_idx + outputs.logits.shape[-2] - input_ids.shape[-1] - next_token_logits = torch.index_select(outputs.logits, -2, output_idx - 1).squeeze(-2) - else: - next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) - - next_tokens_scores = logits_processor(input_ids, next_token_logits) - else: - next_token_logits = outputs.logits[:, -1, :] - if token_idx is not None and self.config.is_encoder_decoder: - # case2 (with KV caching): outputs.logits.shape: [batch_size, 1, vocab_size] - next_tokens_scores = logits_processor(input_ids[:, :token_idx], next_token_logits) - else: - # case3 (default case): token_idx is None - next_tokens_scores = logits_processor(input_ids, next_token_logits) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - 
(outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) - # finished sentences should have their next token be a padding token - if not ignore_eos and eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - if not lazy_mode: - next_tokens = next_tokens.to(input_ids.dtype) - - if token_idx is not None: - input_ids.index_copy_( - 1, token_idx, next_tokens.unsqueeze(-1) if next_tokens.dim() == 1 else next_tokens - ) - else: - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if bucket_size > 0 and bucket_internal: - # Calculate slice idx for kv cache during the decode phase. - # Breaking down the kv cache in the attention block helps to reduce computation time. - if model_kwargs.get("token_idx_cpu") <= (model_kwargs["kv_cache_len"] // bucket_size) * bucket_size: - idx = (model_kwargs.get("token_idx_cpu") - 1) // bucket_size - if prev_idx != idx: - model_kwargs["cache_idx"] = (idx + 1) * bucket_size - prev_idx = idx - else: - model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"] - cur_len = cur_len + 1 - - if ignore_eos: - this_peer_finished = stopping_criteria( - input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - else: - unfinished_sequences = unfinished_sequences & ~stopping_criteria( - input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - - if greedy_first: - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - print(f"First Token time(greedy):{time.perf_counter()*1000}") - greedy_first = False - - if ( - not model_kwargs.get("pad_done", False) - and not model_kwargs.get("reuse_cache", False) - and bucket_internal - ): - # Pad the returned pask key values tensors from prefill phase forward run to maximum length - # before starting the decode phase. 
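The call just below this point pads the prefill-phase `past_key_values` out to `kv_cache_len`, so every decode step runs with the same static tensor shapes (which is what lazy mode and HPU graphs favor). A minimal sketch of that idea, assuming a `[batch, num_heads, seq_len, head_dim]` cache layout; the helper name and layout are illustrative, not the actual `_pad_past_key_values` implementation:

```python
import torch
import torch.nn.functional as F


def pad_past_key_values_to(past_key_values, kv_cache_len, pad_value=0.0):
    """Right-pad every (key, value) pair along the sequence axis to kv_cache_len."""
    padded = []
    for key, value in past_key_values:
        missing = kv_cache_len - key.shape[-2]  # assumes [batch, heads, seq, head_dim]
        if missing > 0:
            # F.pad pairs run from the last dim backwards: (last_l, last_r, seq_l, seq_r)
            key = F.pad(key, (0, 0, 0, missing), value=pad_value)
            value = F.pad(value, (0, 0, 0, missing), value=pad_value)
        padded.append((key, value))
    return tuple(padded)
```

Padding once after the prefill forward pass means the decode-phase graph is built for a single KV-cache shape instead of being recompiled as the cache grows.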
- self._pad_past_key_values(model_kwargs) - model_kwargs["pad_done"] = True - - if ( - model_kwargs.get("use_hpu_graphs", False) - and model_kwargs.get("limit_hpu_graphs", False) - and not model_kwargs.get("reuse_cache", False) - and bucket_internal - ): - # Clear HPU graphs input tensors of the decode phase after the full generation while loop - self.clear_inputs() - # Delete past key value tensors - self._remove_past_key_values(model_kwargs) - - hb_profer.stop() - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return input_ids - - def _sample( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - lazy_mode: Optional[bool] = False, - ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._sample`] directly. Use generate() instead. - For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - ignore_eos (`bool`, *optional*, defaults to `False`): - Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... TopKLogitsWarper, - ... TemperatureLogitsWarper, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... 
) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token - >>> model.config.pad_token_id = model.config.eos_token_id - >>> model.generation_config.pad_token_id = model.config.eos_token_id - - >>> input_prompt = "Today is a beautiful day, and" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> # instantiate logits processors - >>> logits_warper = LogitsProcessorList( - ... [ - ... TopKLogitsWarper(50), - ... TemperatureLogitsWarper(0.7), - ... ] - ... ) - - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT - >>> outputs = model._sample( - ... input_ids, - ... logits_processor=logits_processor, - ... logits_warper=logits_warper, - ... stopping_criteria=stopping_criteria, - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.'] - ```""" - - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if not self.generation_config.ignore_eos: - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." 
- " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() - for criteria in stopping_criteria - if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - # TODO: no ignore_eos check here since there is a compilation error, will add ignore_eos here if fixed - batch_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - bucket_size = model_kwargs.get("bucket_size", -1) - prev_idx = -1 # avoiding calculate cache_idx when its value is not changing - bucket_internal = model_kwargs.get("bucket_internal", None) - reduce_recompile = model_kwargs.get("reduce_recompile", False) - - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() - - sample_first = True - if not bucket_internal: - if bucket_size >= 0: - inc = iter(incrementor(bucket_size, cur_len)) - if bucket_size > 0: - assert "position_ids" not in model_kwargs, "Untested path" - - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None: - # Update cur_len in case of static shapes - cur_len = token_idx.item() - - # auto-regressive generation - 
time_to_first_token_done = False - model_kwargs["pad_done"] = False - model_kwargs["lazy_mode"] = lazy_mode - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - if lazy_mode: - self.htcore_generation.mark_step() - - if bucket_size > 0 and not bucket_internal: - # it will not have been padded if bucket_size > 0 - params = next(inc) - input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( - params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile - ) - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None and outputs.logits.shape[-2] > 1: - if model_kwargs.get("num_virtual_tokens", 0) > 0: - # for prompt tuning, the output logit shape > model_inputs["input_ids"].shape[-1] - if model_kwargs.get("reuse_cache", False): - output_idx = torch.tensor(outputs.logits.shape[-2], device=input_ids.device) - else: - output_idx = token_idx + outputs.logits.shape[-2] - input_ids.shape[-1] - next_token_logits = torch.index_select(outputs.logits, -2, output_idx - 1).squeeze(-2) - else: - next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) - else: - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # sample - probs = torch.nn.functional.softmax(next_token_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - - # finished sentences should have their next token be a padding token - # TODO: no ignore_eos check here since there is a compilation error, will add ignore_eos here if fixed - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - if not lazy_mode: - next_tokens = next_tokens.to(input_ids.dtype) - - if token_idx is not None: - input_ids.index_copy_( - 1, token_idx, next_tokens.unsqueeze(-1) if next_tokens.dim() == 1 else next_tokens - ) - else: - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - 
is_encoder_decoder=self.config.is_encoder_decoder, - ) - cur_len = cur_len + 1 - if bucket_size > 0 and bucket_internal: - # Calculate slice idx for kv cache during the decode phase. - # Breaking down the kv cache in the attention block helps to reduce computation time. - if model_kwargs.get("token_idx_cpu") <= (model_kwargs["kv_cache_len"] // bucket_size) * bucket_size: - idx = (model_kwargs.get("token_idx_cpu") - 1) // bucket_size - if prev_idx != idx: - model_kwargs["cache_idx"] = (idx + 1) * bucket_size - prev_idx = idx - else: - model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"] - - if ignore_eos: - this_peer_finished = stopping_criteria( - input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - else: - unfinished_sequences = unfinished_sequences & ~stopping_criteria( - input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - - if ( - not model_kwargs.get("pad_done", False) - and not model_kwargs.get("reuse_cache", False) - and bucket_internal - ): - # Pad the returned pask key values tensors from prefill phase forward run to maximum length - # before starting the decode phase. - self._pad_past_key_values(model_kwargs) - model_kwargs["pad_done"] = True - - if sample_first: - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - print(f"First Token time(sample):{time.perf_counter()*1000}") - sample_first = False - - hb_gen_time.step() - - if ( - model_kwargs.get("use_hpu_graphs", False) - and model_kwargs.get("limit_hpu_graphs", False) - and not model_kwargs.get("reuse_cache", False) - and bucket_internal - ): - # Clear HPU graphs input tensors of the decode phase after the full generation while loop - self.clear_inputs() - # Delete past key value tensors - self._remove_past_key_values(model_kwargs) - - hb_profer.stop() - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return input_ids - - def _beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - sequential: Optional[bool] = None, - lazy_mode: Optional[bool] = False, - 
profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
-        hb_gen_time: Optional[HabanaGenerationtime] = None,
-        profiling_record_shapes: Optional[bool] = False,
-        **model_kwargs,
-    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
-        r"""
-        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
-        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
-
-
-
-        In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate()
-        instead. For an overview of generation strategies and code examples, check the [following
-        guide](../generation_strategies).
-
-
-
-        Parameters:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                The sequence used as a prompt for the generation.
-            beam_scorer (`BeamScorer`):
-                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
-                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
-            logits_processor (`LogitsProcessorList`, *optional*):
-                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
-                used to modify the prediction scores of the language modeling head applied at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
-                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
-                used to tell if the generation loop should stop.
-            max_length (`int`, *optional*, defaults to 20):
-                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
-                tokens. The maximum length of the sequence to be generated.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`Union[int, List[int]]`, *optional*):
-                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            output_logits (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
-                more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`transformers.utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
-                Whether to continue running the while loop until max_length (needed for ZeRO stage 3).
-            sequential (`bool`, defaults to `False`):
-                By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for
-                more details). This flag will avoid parallelizing the beam search and will instead run beam search
-                sequentially.
-            lazy_mode (`bool`, *optional*, defaults to `False`):
-                Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profiling.
- profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... 
) - - >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - sequential = sequential if sequential is not None else self.generation_config.low_memory - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if not self.generation_config.ignore_eos: - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private and beam scorer refactored - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() - for criteria in stopping_criteria - if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None: - # Update cur_len in case of static shapes - cur_len = token_idx.item() - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
- ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - if self.generation_config.static_shapes: - beam_trace_scores = torch.zeros( - (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.float32 - ) - beam_trace_indices = torch.zeros( - (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.int64 - ) - beam_trace_tokens = torch.zeros( - (input_ids.shape[1], 2 * batch_size * num_beams), device=input_ids.device, dtype=torch.int64 - ) - beam_trace_idx = torch.tensor(0, device=input_ids.device) - num_eos_tokens = torch.zeros((1), device=input_ids.device, dtype=torch.int64) - num_beams_tensor = torch.tensor(num_beams, device=input_ids.device, dtype=torch.int64) - - def finalize_beams(initial_ids, beam_trace, model_config, length_penalty): - beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens = beam_trace - bs = initial_ids.shape[0] - num_beams = beam_trace_scores.shape[1] // (2 * bs) - - beam_trace_idx = beam_trace_idx.item() - beam_trace_scores = beam_trace_scores[:beam_trace_idx, :] - beam_trace_indices = beam_trace_indices[:beam_trace_idx, :] - beam_trace_tokens = beam_trace_tokens[:beam_trace_idx, :] - - # (score, parent_beam, token_id, is_finished) - root = (float("-inf"), None, None, False) - - def resolve_beam(beam): - if beam == root: - return [] - score, prev, tok, is_finished = beam - rest = resolve_beam(prev) - rest.append(tok) - return rest - - prev_beams = [[root] * num_beams] * bs - best = [root] * bs - - def beam_score(beam): - return (beam[3], beam[0]) - - for step, (scores, indices, tokens) in enumerate( - zip(beam_trace_scores, beam_trace_indices, beam_trace_tokens) - ): - cur_beams = [[] for _ in range(bs)] - for idx, (s, i, t) in enumerate(zip(scores, indices, tokens)): - batch = idx // (num_beams * 2) - idx = idx % (num_beams * 2) - b_len = 1 + step - b_score = s.item() / (b_len**length_penalty) - b_tok = t.item() - is_finished = b_tok == model_config.eos_token_id - if len(cur_beams[batch]) >= num_beams: - continue - beam = (b_score, prev_beams[batch][i], b_tok, is_finished) - if not is_finished: - cur_beams[batch].append(beam) - if is_finished or (step + 1 == beam_trace_idx): - if beam_score(best[batch]) < 
beam_score(beam): - best[batch] = beam - prev_beams = cur_beams - - def expand_if_needed(tensor, new_size, value, dim=-1): - orig_len = tensor.shape[dim] - padding_len = new_size - orig_len - import torch.nn.functional as F - - if padding_len > 0: - if dim == -1: - return F.pad(tensor, (0, padding_len), value=value) - elif dim == -2: - return F.pad(tensor, (0, 0, 0, padding_len), value=value) - else: - assert False, f"Unsupported dim value: {dim}" - return tensor - - result = [ - torch.cat( - [initial_ids[i], torch.tensor(resolve_beam(b), dtype=initial_ids.dtype, device=initial_ids.device)] - ) - for i, b in enumerate(best) - ] - max_length = max([t.shape[-1] for t in result]) - result = [expand_if_needed(res, max_length, model_config.pad_token_id) for res in result] - input_ids = torch.stack(result) - return input_ids - - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() - this_peer_finished = False - - bucket_size = model_kwargs.get("bucket_size", -1) - reduce_recompile = model_kwargs.get("reduce_recompile", False) - prompt_len = input_ids.shape[-1] - if bucket_size >= 0: - inc = iter(incrementor(bucket_size, prompt_len)) - if bucket_size > 0: - assert "position_ids" not in model_kwargs, "Untested path" - if self.generation_config.static_shapes: - initial_ids = input_ids[::num_beams, 0:cur_len] - - time_to_first_token_done = False - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - if lazy_mode: - self.htcore_generation.mark_step() - - if bucket_size > 0: - # it will not have been padded if bucket_size > 0 - params = next(inc) - input_ids, model_kwargs = self.update_model_kwargs_for_bucketing( - params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile - ) - - model_kwargs["lazy_mode"] = lazy_mode - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # if sequential is True, split the input to batches of batch_size and run sequentially - if sequential: - if any( - model_name in self.__class__.__name__.lower() - for model_name in [ - "fsmt", - "reformer", - "bloom", - "ctrl", - "gpt_bigcode", - "transo_xl", - "xlnet", - "cpm", - "jamba", - ] - ): - raise RuntimeError( - f"Currently generation for {self.__class__.__name__} is not supported " - f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." 
- ) - - inputs_per_sub_batches = _split_model_inputs( - model_inputs, split_size=batch_size, full_batch_size=batch_beam_size - ) - outputs_per_sub_batch = [ - self( - **inputs_per_sub_batch, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - for inputs_per_sub_batch in inputs_per_sub_batches - ] - - outputs = stack_model_outputs(outputs_per_sub_batch) - else: - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None and outputs.logits.shape[-2] > 1: - if model_kwargs.get("num_virtual_tokens", 0) > 0: - # for prompt tuning, the output logit shape may > model_inputs["input_ids"].shape[-1] - if model_kwargs.get("reuse_cache", False): - output_idx = torch.tensor(outputs.logits.shape[-2], device=input_ids.device) - else: - output_idx = token_idx + outputs.logits.shape[-2] - input_ids.shape[-1] - next_token_logits = torch.index_select(outputs.logits, -2, output_idx - 1).squeeze(-2) - else: - next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) - else: - next_token_logits = outputs.logits[:, -1, :] - - next_token_scores = torch.nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - if token_idx is not None: - next_token_scores_processed = logits_processor(input_ids[:, :token_idx], next_token_scores) - else: - next_token_scores_processed = logits_processor(input_ids, next_token_scores) - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( - next_token_scores_processed - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
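As an aside on the reshape and the comment just above (the deleted code continues immediately after this sketch): the per-beam score rows are flattened so the top candidates can be taken jointly across all beams of a batch element, and each flat index is then split back into a beam index (integer division by the vocabulary size) and a token id (remainder). Keeping `max(2, 1 + len(eos_token_id)) * num_beams` candidates leaves enough non-EOS continuations even if some of the best candidates are end-of-sequence tokens. The snippet below is a toy illustration of that arithmetic with made-up numbers and variable names; only the flatten/topk/decompose steps mirror the code that follows.

```python
# Toy illustration (not optimum-habana code) of flatten -> topk -> (beam, token) decomposition.
import torch

batch_size, num_beams, vocab_size = 1, 2, 5
# Pretend log-probabilities for each beam, shape (batch_size * num_beams, vocab_size).
scores = torch.log(torch.tensor([[0.10, 0.70, 0.15, 0.04, 0.01],
                                 [0.60, 0.05, 0.25, 0.07, 0.03]]))
# Flatten the beams so top-k is taken jointly over all beams of each batch element.
flat_scores = scores.view(batch_size, num_beams * vocab_size)
top_scores, top_idx = torch.topk(flat_scores, k=2 * num_beams, dim=1, largest=True, sorted=True)
beam_index = torch.div(top_idx, vocab_size, rounding_mode="floor")  # which beam the candidate extends
token_id = top_idx % vocab_size                                     # which vocabulary token it adds
print(beam_index)  # tensor([[0, 1, 1, 0]])
print(token_id)    # tensor([[1, 0, 2, 2]])
```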
- n_eos_tokens = len(eos_token_id) if eos_token_id else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - if self.generation_config.static_shapes: - beam_scores = next_token_scores.flatten() - static_beam_indices = next_indices.flatten() - - beam_tokens = next_tokens.remainder(vocab_size).flatten() - - beam_trace_scores.index_copy_(0, beam_trace_idx, beam_scores.unsqueeze(0)) - beam_trace_indices.index_copy_(0, beam_trace_idx, static_beam_indices.unsqueeze(0)) - beam_trace_tokens.index_copy_(0, beam_trace_idx, beam_tokens.unsqueeze(0)) - beam_trace_idx.add_(1) - - if self.generation_config.early_stopping: - num_eos_tokens.add_(beam_tokens[0:num_beams].eq(self.config.eos_token_id).sum()) - - beam_scores.add_(torch.where(beam_tokens.eq(self.config.eos_token_id), float("-inf"), 0.0)) - beam_scores = beam_scores.view(batch_size, -1).unsqueeze(0) - _, selected = torch.topk(beam_scores, k=num_beams, dim=-1, largest=True, sorted=True) - offset = torch.arange(0, torch.numel(beam_scores), beam_scores.shape[-1]).unsqueeze(-1) - selected = (selected + offset).flatten() - beam_scores = beam_scores.flatten().index_select(0, selected) - beam_tokens = beam_tokens.index_select(0, selected) - static_beam_indices = static_beam_indices.index_select(0, selected) - - prev_beams = outputs.logits.shape[0] // batch_size - - beam_offsets = torch.arange(0, 1, prev_beams, dtype=torch.int32) - beam_offsets = beam_offsets.to(device=outputs.logits.device) - static_beam_indices = (static_beam_indices.view(batch_size, -1) + beam_offsets.unsqueeze(-1)).flatten() - - next_tokens = beam_tokens.unsqueeze(-1) - beam_next_tokens = next_tokens - beam_idx = static_beam_indices - else: - next_tokens = next_tokens % vocab_size - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - decoder_prompt_len=prompt_len, - ) - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - if token_idx is not None: - input_ids = torch.index_select(input_ids, 0, beam_idx) - input_ids.index_copy_( - 1, token_idx, beam_next_tokens.unsqueeze(-1) if beam_next_tokens.dim() == 1 else beam_next_tokens - ) - else: - input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if model_kwargs.get("past_key_values", None) is not None: - if model_kwargs["reuse_cache"]: - model_kwargs["past_key_values"] = unwrap_deepspeed_model(self).reorder_kv_cache(beam_idx) - else: - model_kwargs["past_key_values"] = self._temporary_reorder_cache( - model_kwargs["past_key_values"], beam_idx - ) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - hb_profer.step() - if self.generation_config.static_shapes: - is_min_length_reached = ( - self.generation_config.min_length and cur_len >= self.generation_config.min_length - ) - if ( - self.generation_config.early_stopping - and is_min_length_reached - and num_eos_tokens >= num_beams_tensor - ): - 
break - elif get_final_stopping_criteria(stopping_criteria(input_ids, scores, token_idx=cur_len)): - break - elif get_final_stopping_criteria(stopping_criteria(input_ids, scores)) or ( - beam_scorer.is_done and not lazy_mode - ): - this_peer_finished = True - - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - hb_profer.stop() - - if self.generation_config.static_shapes: - beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens) - from collections import UserDict - - def map_tensors(obj, fn): - constructor = type(obj) - if isinstance(obj, tuple): - return constructor(map_tensors(v, fn) for v in obj) - if isinstance(obj, list): - return constructor([map_tensors(v, fn) for v in obj]) - if isinstance(obj, dict) or isinstance(obj, UserDict): - return constructor({k: map_tensors(v, fn) for k, v in obj.items()}) - if isinstance(obj, torch.Tensor): - return fn(obj) - return obj - - def move(obj, device): - return map_tensors(obj, lambda t: t.to(device)) - - sequence_outputs = {} - sequence_outputs["sequences"] = finalize_beams( - initial_ids.cpu(), move(beam_trace, "cpu"), self.config, self.generation_config.length_penalty - ) - else: - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - beam_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - decoder_prompt_len=prompt_len, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return sequence_outputs["sequences"] - - def _beam_sample( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> 
Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search multinomial - sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation._GenerationMixin.beam_sample`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. 
- profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... TopKLogitsWarper, - ... TemperatureLogitsWarper, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... max_length=model.config.max_length, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)] - ... ) - >>> # instantiate logits processors - >>> logits_warper = LogitsProcessorList( - ... [ - ... TopKLogitsWarper(50), - ... TemperatureLogitsWarper(0.7), - ... ] - ... ) - - >>> outputs = model._beam_sample( - ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - - raise NotImplementedError("Beam search sampling is not supported by optimum-habana yet.") - - def _group_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._group_beam_search`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. 
- return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific kwargs that will be forwarded to the `forward` function of the model. If - model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if [`transformers.generation.BeamSearchDecoderOnlyOutput`] if - `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a - [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... HammingDiversityLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - >>> # lets run diverse beam search using 6 beams - >>> num_beams = 6 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... max_length=model.config.max_length, - ... num_beams=num_beams, - ... device=model.device, - ... num_beam_groups=3, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3), - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model._group_beam_search( - ... input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - - raise NotImplementedError("Group beam search is not supported by optimum-habana yet.") - - def _constrained_beam_search( - self, - input_ids: torch.LongTensor, - constrained_beam_scorer: ConstrainedBeamSearchScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = None, - lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> Union[GenerateBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **constrained beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin._constrained_beam_search`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - constrained_beam_scorer (`ConstrainedBeamSearchScorer`): - A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation, while satisfying a list of positive constraints. For more information, the - documentation of [`ConstrainedBeamSearchScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. 
- output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.utils.GenerateBeamDecoderOnlyOutput`], [`transformers.generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... ConstrainedBeamSearchScorer, - ... PhrasalConstraint, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> constraint_str = "Sie" - >>> constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # slice to remove eos token - >>> constraints = [PhrasalConstraint(token_ids=constraint_token_ids)] - - >>> # instantiate beam scorer - >>> beam_scorer = ConstrainedBeamSearchScorer( - ... batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... 
) - - >>> outputs = model._constrained_beam_search( - ... input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt sind Sie?'] - ```""" - - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if not self.generation_config.ignore_eos: - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private and beam scorer refactored - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() - for criteria in stopping_criteria - if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(constrained_beam_scorer._beam_hyps) - num_beams = constrained_beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - token_idx = model_kwargs.get("token_idx", None) - if token_idx is not None: - # Update cur_len in case of static shapes - cur_len = token_idx.item() - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
- ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False - - decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder - - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() - - time_to_first_token_done = False - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - model_kwargs["lazy_mode"] = lazy_mode - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - if token_idx is not None and outputs.logits.shape[-2] > 1: - if model_kwargs.get("num_virtual_tokens", 0) > 0: - # for prompt tuning, the output logit shape > model_inputs["input_ids"].shape[-1] - if model_kwargs.get("reuse_cache", False): - output_idx = torch.tensor(outputs.logits.shape[-2], device=input_ids.device) - else: - output_idx = token_idx + outputs.logits.shape[-2] - input_ids.shape[-1] - next_token_logits = torch.index_select(outputs.logits, -2, output_idx - 1).squeeze(-2) - else: - next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) - else: - next_token_logits = outputs.logits[:, -1, :] - - next_token_scores = torch.nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor(input_ids, next_token_scores) - - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( - next_token_scores_processed - ) - - scores_for_all_vocab = next_token_scores.clone() - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores,) - if output_logits: - raw_logits += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if 
self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. - n_eos_tokens = len(eos_token_id) if eos_token_id else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = (next_tokens / vocab_size).long() - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = constrained_beam_scorer.process( - input_ids[:, :cur_len], - next_token_scores, - next_tokens, - next_indices, - scores_for_all_vocab, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - decoder_prompt_len=decoder_prompt_len, - ) - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - if token_idx is not None: - input_ids = input_ids[beam_idx, :] - input_ids.index_copy_( - 1, token_idx, beam_next_tokens.unsqueeze(-1) if beam_next_tokens.dim() == 1 else beam_next_tokens - ) - else: - input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if model_kwargs.get("past_key_values", None) is not None: - model_kwargs["past_key_values"] = self._temporary_reorder_cache( - model_kwargs["past_key_values"], beam_idx - ) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - - hb_profer.step() - - if constrained_beam_scorer.is_done or get_final_stopping_criteria( - stopping_criteria(input_ids, scores, token_idx=cur_len) - ): - this_peer_finished = True - - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - - hb_profer.stop() - sequence_outputs = constrained_beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - decoder_prompt_len=decoder_prompt_len, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - 
sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return sequence_outputs["sequences"] - - def _assisted_decoding( - self, - input_ids: torch.LongTensor, - candidate_generator: Optional["GaudiCandidateGenerator"] = None, - do_sample: bool = False, - logits_processor: Optional[LogitsProcessorList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - output_logits: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - lazy_mode: Optional[bool] = False, - ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, - hb_gen_time: Optional[HabanaGenerationtime] = None, - profiling_record_shapes: Optional[bool] = False, - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** or - **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a - candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text - models. - - - - In most cases, you do not need to call [`transformers.generation.GenerationMixin._assisted_decoding`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - candidate_generator (`CandidateGenerator`, *optional*): - A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For - more information, the documentation of [`CandidateGenerator`] should be read. - do_sample (`bool`, *optional*, defaults to `False`): - Whether or not to use sampling ; use greedy decoding otherwise. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. 
- output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - output_logits (`bool`, *optional*, defaults to `False`): - Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for - more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - lazy_mode (`bool`, *optional*, defaults to `False`): - Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`transformers.generation.GenerateDecoderOnlyOutput`], [`transformers.generation.GenerateEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`transformers.generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`transformers.generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - >>> from transformers.generation import AssistedCandidateGenerator - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilgpt2") - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - >>> candidate_generator = AssistedCandidateGenerator( - ... input_ids=input_ids, - ... assistant_model=assistant_model, - ... 
generation_config=model.generation_config, - ... logits_processor=logits_processor, - ... model_kwargs={}, - ... ) - >>> outputs = model._assisted_decoding( - ... input_ids, - ... candidate_generator=candidate_generator, - ... logits_processor=logits_processor, - ... stopping_criteria=stopping_criteria, - ... ) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - # do_sample = logits_warper is not None - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - if eos_token_id is not None: - logger.warning_once( - "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" - " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." - " Otherwise make sure to set `model.generation_config.eos_token_id`", - FutureWarning, - ) - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - else: - # TODO remove when the method is totally private and beam scorer refactored - # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever - eos_token_id = [ - criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") - ] - eos_token_id = eos_token_id[0] if eos_token_id else None - if eos_token_id is None and self.generation_config.eos_token_id is not None: - eos_token_id = self.generation_config.eos_token_id - stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_logits = output_logits if output_logits is not None else self.generation_config.output_logits - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - batch_size, cur_len = input_ids.shape - if 
"inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - if not ignore_eos: - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) - - hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) - hb_profer.start() - this_peer_finished = False - - token_idx = model_kwargs.get("token_idx", None) - time_to_first_token_done = False - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - if lazy_mode: - self.htcore_generation.mark_step() - - if token_idx is not None: - # Update cur_len in case of static shapes - cur_len = token_idx.item() - else: - cur_len = input_ids.shape[-1] - - # prepare model inputs - model_kwargs["lazy_mode"] = lazy_mode - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # 1. Fetch candidate sequences from a `CandidateGenerator` - - candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids[:, :cur_len]) - candidate_input_ids = candidate_input_ids.to(self.device) - if candidate_logits is not None: - candidate_logits = candidate_logits.to(self.device) - - if self.generation_config.static_shapes: - candidate_length = candidate_input_ids.shape[1] - cur_len - else: - candidate_length = candidate_input_ids.shape[1] - input_ids.shape[1] - is_done_candidate = stopping_criteria(candidate_input_ids, None) - - # 2. Use the original model to obtain the next token logits given the candidate sequence. We obtain - # `candidate_length + 1` relevant logits from this process: in the event that all candidates are correct, - # we use this forward pass to also pick the subsequent logits in the original model. - - # 2.1. Prepare the model inputs - model_kwargs = _prepare_attention_mask( - model_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder - ) - model_kwargs = _prepare_token_type_ids(model_kwargs, candidate_input_ids.shape[1]) - if "cache_position" in model_kwargs: - model_kwargs["cache_position"] = torch.cat( - ( - model_kwargs["cache_position"], - torch.arange(cur_len, cur_len + candidate_length, device=input_ids.device, dtype=torch.long), - ), - dim=0, - ) - - model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **model_kwargs) - if "num_logits_to_keep" in model_inputs: - model_inputs["num_logits_to_keep"] = candidate_length + 1 - - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - - # 2.2. Run a forward pass on the candidate sequence - outputs = self( - **model_inputs, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) - - # 2.3. Process the new logits - new_logits = outputs.logits[:, -candidate_length - 1 :] # excludes the input prompt if present - next_token_logits = new_logits.clone() - if len(logits_processor) > 0: - for i in range(candidate_length + 1): - new_logits[:, i, :] = logits_processor(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :]) - if do_sample and len(logits_warper) > 0: - for i in range(candidate_length + 1): - new_logits[:, i, :] = logits_warper(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :]) - - # 3. Select the accepted tokens. There are two possible cases: - # Case 1: `do_sample=True` and we have logits for the candidates (originally from speculative decoding) - # 👉 Apply algorithm 1 from the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf). 
- if do_sample and candidate_logits is not None: - from transformers.generation.utils import _speculative_sampling - - valid_tokens, n_matches = _speculative_sampling( - candidate_input_ids, - candidate_logits, - candidate_length, - new_logits, - is_done_candidate, - ) - - # Case 2: all other cases (originally from assisted generation) 👉 Compare the tokens selected from the - # original model logits with the candidate tokens. We can keep the candidate tokens until the first - # mismatch, or until the max length is reached. - else: - if do_sample: - probs = new_logits.softmax(dim=-1) - selected_tokens = torch.multinomial(probs[0, :, :], num_samples=1).squeeze(1)[None, :] - else: - selected_tokens = new_logits.argmax(dim=-1) - - candidate_new_tokens = candidate_input_ids[:, cur_len:] - n_matches = ((~(candidate_new_tokens == selected_tokens[:, :-1])).cumsum(dim=-1) < 1).sum() - - # Ensure we don't generate beyond max_len or an EOS token - if is_done_candidate and n_matches == candidate_length: - n_matches -= 1 - valid_tokens = selected_tokens[:, : n_matches + 1] - - # 4. Update variables according to the number of matching assistant tokens. Remember: the token generated - # by the model after the last candidate match is also valid, as it is generated from a correct sequence. - # Because of this last token, assisted generation search reduces to a normal greedy search/sample if there - # is no match. - - # 4.1. Get the valid continuation, after the matching tokens - if self.generation_config.static_shapes: - input_ids[:, cur_len : cur_len + n_matches + 1] = valid_tokens - else: - input_ids = torch.cat((input_ids, valid_tokens), dim=-1) - if streamer is not None: - streamer.put(valid_tokens.cpu()) - new_cur_len = input_ids.shape[-1] - - # 4.2. Discard past key values relative to unused assistant tokens - new_cache_size = new_cur_len - 1 - outputs.past_key_values = _crop_past_key_values(self, outputs.past_key_values, new_cache_size) - - # 5. Update the candidate generation strategy if needed - candidate_generator.update_candidate_strategy(input_ids, new_logits, n_matches) - - # Store scores, attentions and hidden_states when required - # Assistant: modified to append one tuple element per token, as in the other generation methods. 
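For the greedy-matching branch above (Case 2), the accepted length is simply the number of leading draft tokens that agree with the main model's own picks, plus the one extra token the main model produces after the last match. A minimal standalone sketch of that counting rule, with made-up token ids:

```python
import torch

# Made-up ids: the draft model proposed 4 tokens; the main model's picks include one extra.
candidate_new_tokens = torch.tensor([[11, 42, 7, 99]])   # draft proposals
selected_tokens = torch.tensor([[11, 42, 7, 13, 5]])     # main model's choices (one longer)

# Count leading positions where the draft agrees with the main model.
mismatch = ~(candidate_new_tokens == selected_tokens[:, :-1])
n_matches = (mismatch.cumsum(dim=-1) < 1).sum()          # -> 3 here
valid_tokens = selected_tokens[:, : n_matches + 1]       # 3 matches + 1 bonus token
print(n_matches.item(), valid_tokens)                    # 3  tensor([[11, 42,  7, 13]])
```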
- if return_dict_in_generate: - if output_scores: - scores += tuple(new_logits[:, i, :] for i in range(n_matches + 1)) - if output_logits: - raw_logits += (next_token_logits,) - - if "past_key_values" not in model_kwargs: - added_len = new_cur_len - else: - added_len = n_matches + 1 - - if output_attentions: - if self.config.is_encoder_decoder: - cross_attentions = _split_model_outputs( - cross_attentions, outputs.cross_attentions, cur_len, added_len - ) - decoder_attentions = _split_model_outputs( - decoder_attentions, - outputs.decoder_attentions, - cur_len, - added_len, - is_decoder_attention=True, - ) - else: - decoder_attentions = _split_model_outputs( - decoder_attentions, - outputs.attentions, - cur_len, - added_len, - is_decoder_attention=True, - ) - if output_hidden_states: - if self.config.is_encoder_decoder: - decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.decoder_hidden_states, cur_len, added_len - ) - else: - decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.hidden_states, cur_len, added_len - ) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - - if ignore_eos: - this_peer_finished = stopping_criteria( - input_ids, scores, token_idx=None, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - else: - unfinished_sequences = unfinished_sequences & ~stopping_criteria( - input_ids, scores, token_idx=None, ignore_eos=ignore_eos, eos_token_id=eos_token_id - ) - this_peer_finished = unfinished_sequences.max() == 0 - - hb_profer.step() - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - - if this_peer_finished and not synced_gpus: - break - - hb_profer.stop() - if streamer is not None: - streamer.end() - - if ( - hasattr(candidate_generator, "assistant_model") - and candidate_generator.assistant_model.generation_config.num_assistant_tokens_schedule == "heuristic" - ): - candidate_generator.assistant_model.generation_config.num_assistant_tokens = ( - candidate_generator.num_assistant_tokens - ) - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return input_ids diff --git a/optimum/habana/transformers/gradient_checkpointing.py b/optimum/habana/transformers/gradient_checkpointing.py deleted file mode 100644 index 63dee25f46..0000000000 --- a/optimum/habana/transformers/gradient_checkpointing.py +++ /dev/null @@ -1,209 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file just adds a mark_step() at the beginning of the -# backward pass when gradient checkpointing is performed -# Original implementation here: https://github.com/pytorch/pytorch/blob/v1.10.2/torch/utils/checkpoint.py - -import warnings -from typing import Any, Iterable, Tuple - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpu as hthpu -import torch - - -def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: - if isinstance(inputs, tuple): - out = [] - for inp in inputs: - if not isinstance(inp, torch.Tensor): - out.append(inp) - continue - - x = inp.detach() - x.requires_grad = inp.requires_grad - out.append(x) - return tuple(out) - else: - raise RuntimeError("Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) - - -def check_backward_validity(inputs: Iterable[Any]) -> None: - if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)): - warnings.warn("None of the inputs have requires_grad=True. Gradients will be None") - - -def _get_autocast_kwargs(): - hpu_autocast_kwargs = { - "device_type": "hpu", - "enabled": hthpu.is_autocast_hpu_enabled(), - "dtype": hthpu.get_autocast_hpu_dtype(), - } - - cpu_autocast_kwargs = { - "enabled": torch.is_autocast_cpu_enabled(), - "dtype": torch.get_autocast_cpu_dtype(), - "cache_enabled": torch.is_autocast_cache_enabled(), - } - - return hpu_autocast_kwargs, cpu_autocast_kwargs - - -class CheckpointFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, run_function, preserve_rng_state, *args): - if torch.is_grad_enabled(): # grad may be disabled, e.g., during validation - check_backward_validity(args) - ctx.run_function = run_function - ctx.preserve_rng_state = preserve_rng_state - ctx.hpu_autocast_kwargs, ctx.cpu_autocast_kwargs = _get_autocast_kwargs() - if preserve_rng_state: - ctx.fwd_cpu_state = torch.get_rng_state() - # Don't eagerly initialize the cuda context by accident. - # (If the user intends that the context is initialized later, within their - # run_function, we SHOULD actually stash the cuda state here. Unfortunately, - # we have no way to anticipate this will happen before we run the function.) - ctx.had_cuda_in_fwd = False - - # Save non-tensor inputs in ctx, keep a placeholder None for tensors - # to be filled out during the backward. - ctx.inputs = [] - ctx.tensor_indices = [] - tensor_inputs = [] - for i, arg in enumerate(args): - if torch.is_tensor(arg): - tensor_inputs.append(arg) - ctx.tensor_indices.append(i) - ctx.inputs.append(None) - else: - ctx.inputs.append(arg) - - ctx.save_for_backward(*tensor_inputs) - - with torch.no_grad(): - outputs = run_function(*args) - return outputs - - @staticmethod - def backward(ctx, *args): - if not torch.autograd._is_checkpoint_valid(): - raise RuntimeError( - "Checkpointing is not compatible with .grad() or when an `inputs` parameter" - " is passed to .backward(). Please use .backward() and do not pass its `inputs`" - " argument." - ) - - htcore.mark_step() - - # Copy the list to avoid modifying original list. 
- inputs = list(ctx.inputs) - tensor_indices = ctx.tensor_indices - tensors = ctx.saved_tensors - - # Fill in inputs with appropriate saved tensors. - for i, idx in enumerate(tensor_indices): - inputs[idx] = tensors[i] - - # Stash the surrounding rng state, and mimic the state that was - # present at this time during forward. Restore the surrounding state - # when we're done. - rng_devices = [] - with torch.random.fork_rng(devices=rng_devices, enabled=ctx.preserve_rng_state): - if ctx.preserve_rng_state: - torch.set_rng_state(ctx.fwd_cpu_state) - detached_inputs = detach_variable(tuple(inputs)) - with torch.enable_grad(), torch.autocast(**ctx.hpu_autocast_kwargs), torch.cpu.amp.autocast( - **ctx.cpu_autocast_kwargs - ): - outputs = ctx.run_function(*detached_inputs) - - if isinstance(outputs, torch.Tensor): - outputs = (outputs,) - - # run backward() with only tensor that requires grad - outputs_with_grad = [] - args_with_grad = [] - for i in range(len(outputs)): - if torch.is_tensor(outputs[i]) and outputs[i].requires_grad: - outputs_with_grad.append(outputs[i]) - args_with_grad.append(args[i]) - if len(outputs_with_grad) == 0: - raise RuntimeError("none of output has requires_grad=True, this checkpoint() is not necessary") - torch.autograd.backward(outputs_with_grad, args_with_grad) - grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None for inp in detached_inputs) - - return (None, None) + grads - - -def checkpoint(function, *args, **kwargs): - r"""Checkpoint a model or part of the model - Checkpointing works by trading compute for memory. Rather than storing all - intermediate activations of the entire computation graph for computing - backward, the checkpointed part does **not** save intermediate activations, - and instead recomputes them in backward pass. It can be applied on any part - of a model. - Specifically, in the forward pass, :attr:`function` will run in - :func:`torch.no_grad` manner, i.e., not storing the intermediate - activations. Instead, the forward pass saves the inputs tuple and the - :attr:`function` parameter. In the backwards pass, the saved inputs and - :attr:`function` is retrieved, and the forward pass is computed on - :attr:`function` again, now tracking the intermediate activations, and then - the gradients are calculated using these activation values. - The output of :attr:`function` can contain non-Tensor values and gradient - recording is only performed for the Tensor values. Note that if the output - consists of nested structures (ex: custom objects, lists, dicts etc.) - consisting of Tensors, these Tensors nested in custom structures will not - be considered as part of autograd. - .. warning:: - Checkpointing currently only supports :func:`torch.autograd.backward` - and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` - is not supported. - .. warning:: - If :attr:`function` invocation during backward does anything different - than the one during forward, e.g., due to some global variable, the - checkpointed version won't be equivalent, and unfortunately it can't be - detected. - .. warning:: - If checkpointed segment contains tensors detached from the computational - graph by `detach()` or `torch.no_grad()`, the backward pass will raise an - error. This is because `checkpoint` makes all the outputs require - gradients which causes issues when a tensor is defined to have no - gradient in the model. To circumvent this, detach the tensors outside of - the `checkpoint` function. - .. 
warning:: - At least one of the inputs needs to have :code:`requires_grad=True` if - grads are needed for model inputs, otherwise the checkpointed part of the - model won't have gradients. At least one of the outputs needs to have - :code:`requires_grad=True` as well. - Args: - function: describes what to run in the forward pass of the model or - part of the model. It should also know how to handle the inputs - passed as the tuple. For example, in LSTM, if user passes - ``(activation, hidden)``, :attr:`function` should correctly use the - first input as ``activation`` and the second input as ``hidden`` - preserve_rng_state(bool, optional, default=True): Omit stashing and restoring - the RNG state during each checkpoint. - args: tuple containing inputs to the :attr:`function` - Returns: - Output of running :attr:`function` on :attr:`*args` - """ - # Hack to mix *args with **kwargs in a python 2.7-compliant way - preserve = kwargs.pop("preserve_rng_state", True) - if kwargs: - raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)) - - return CheckpointFunction.apply(function, preserve, *args) diff --git a/optimum/habana/transformers/integrations/deepspeed.py b/optimum/habana/transformers/integrations/deepspeed.py deleted file mode 100644 index 716d7cd141..0000000000 --- a/optimum/habana/transformers/integrations/deepspeed.py +++ /dev/null @@ -1,153 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Integration with Deepspeed -""" - -import torch -from transformers.dependency_versions_check import dep_version_check -from transformers.integrations.deepspeed import ( - HfDeepSpeedConfig, - HfTrainerDeepSpeedConfig, - deepspeed_optim_sched, - set_hf_deepspeed_config, -) - -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -class GaudiTrainerDeepSpeedConfig(HfTrainerDeepSpeedConfig): - """ - Adapted from: https://github.com/huggingface/transformers/blob/6da93f5580e109fad5f7b523cf2b6e8a5bafb623/src/transformers/integrations/deepspeed.py#L69 - - The differences are: - - disable DeepSpeed version check as we run a custom version on HPU - - remove uncompatible args (e.g. fp16) in config processing - """ - - def __init__(self, config_file_or_dict): - set_hf_deepspeed_config(self) - dep_version_check("accelerate") - # dep_version_check("deepspeed") - super(HfDeepSpeedConfig, self).__init__(config_file_or_dict) - self._dtype = None - self.mismatches = [] - - def trainer_config_process(self, args, auto_find_batch_size=False): - """ - Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object - creation. 
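The method body that follows copies the matching `TrainingArguments` values into the DeepSpeed config and enforces DeepSpeed's batch-size relation `train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps`. A rough illustration of that arithmetic with hypothetical values:

```python
# Hypothetical values, only to illustrate the relation the config processing fills in.
world_size = 8                      # number of devices
per_device_train_batch_size = 4     # becomes "train_micro_batch_size_per_gpu"
gradient_accumulation_steps = 2     # becomes "gradient_accumulation_steps"

train_batch_size = world_size * per_device_train_batch_size * gradient_accumulation_steps
assert train_batch_size == 64       # the value matched against "train_batch_size"
```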
- """ - # DeepSpeed does: - # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps - train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps - self.fill_match( - "train_micro_batch_size_per_gpu", - args.per_device_train_batch_size, - "per_device_train_batch_size", - not auto_find_batch_size, - ) - self.fill_match( - "gradient_accumulation_steps", - args.gradient_accumulation_steps, - "gradient_accumulation_steps", - ) - self.fill_match( - "train_batch_size", - train_batch_size, - "train_batch_size (calculated)", - not auto_find_batch_size, - ) - self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") - - self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") - self.fill_match( - "optimizer.params.betas", - [args.adam_beta1, args.adam_beta2], - "adam_beta1+adam_beta2", - ) - self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") - self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") - - self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg - self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate") - # total_num_steps - will get set in trainer_config_finalize - - if args.save_on_each_node: - # deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True - self.config["checkpoint"] = self.config.get("checkpoint", {}) - self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node - - # deepspeed's default mode is fp16 unless there is a config that says differently - if self.is_true("bf16.enabled"): - self._dtype = torch.bfloat16 - else: - self._dtype = torch.float32 - - -def deepspeed_init(trainer, num_training_steps, inference=False): - """ - Adapted from: https://github.com/huggingface/transformers/blob/6da93f5580e109fad5f7b523cf2b6e8a5bafb623/src/transformers/integrations/deepspeed.py#L316 - - The difference is: - - add a workaround to cast the model to the target dtype - """ - from deepspeed.utils import logger as ds_logger - - model = trainer.model - args = trainer.args - - hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config - - # TODO: temporary workaround - # To remove when it is solved, see https://github.com/HabanaAI/Model-References/blob/5df5baa261a677b63a2b20eab59981d05c191df5/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/run_pretraining.py#L608 - model.to(dtype=hf_deepspeed_config.dtype(), device="hpu") - - # resume config update - some bits like `model` and `num_training_steps` only become available during train - hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) - - # set the Deepspeed log level consistent with the Trainer - ds_logger.setLevel(args.get_process_log_level()) - - if inference: - # only Z3 makes sense for the inference - if not hf_deepspeed_config.is_zero3(): - raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") - - # in case the training config is re-used for inference - hf_deepspeed_config.del_config_sub_tree("optimizer") - hf_deepspeed_config.del_config_sub_tree("lr_scheduler") - optimizer, lr_scheduler = None, None - model_parameters = None - else: - trainer.optimizer = None # important for when deepspeed_init is used as re-init - model_parameters = list(filter(lambda p: p.requires_grad, model.parameters())) - optimizer, lr_scheduler = deepspeed_optim_sched( - trainer, hf_deepspeed_config, 
args, num_training_steps, model_parameters - ) - - # keep for quick debug: - # from pprint import pprint; pprint(config) - - return optimizer, lr_scheduler - - -def unwrap_deepspeed_model(model): - if hasattr(model, "module"): - return model.module - return model diff --git a/optimum/habana/transformers/modeling_attn_mask_utils.py b/optimum/habana/transformers/modeling_attn_mask_utils.py deleted file mode 100755 index 1aa21bed30..0000000000 --- a/optimum/habana/transformers/modeling_attn_mask_utils.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import torch -from transformers.modeling_attn_mask_utils import AttentionMaskConverter - - -@dataclass -class GaudiAttentionMaskConverter(AttentionMaskConverter): - """ - Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L21 - - Differences: - - replace `triu` with similar logic here: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L169 - """ - - @staticmethod - def _make_causal_mask( - input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0, - sliding_window: Optional[int] = None, - ): - """ - Make causal mask used for bi-directional self-attention. 
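The method body that follows builds the causal pattern from `torch.arange` index comparisons rather than `torch.triu`/`torch.tril`, in line with the converter's stated goal of replacing `triu`. A small standalone check of the equivalence (sizes are arbitrary):

```python
import torch

tgt_len = 4
rows = torch.arange(tgt_len).view(-1, 1)   # column vector of query positions
cols = torch.arange(tgt_len)               # row vector of key positions
allowed = cols <= rows                     # True where key position j may be attended from query i

# Same lower-triangular pattern that torch.tril would produce.
reference = torch.tril(torch.ones(tgt_len, tgt_len, dtype=torch.bool))
assert torch.equal(allowed, reference)
```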
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - - # add lower triangular sliding window mask if necessary - if sliding_window is not None: - diagonal = past_key_values_length - sliding_window - 1 - - # Replace tril with below - row_indices = torch.arange(mask.size(0), device=mask.device).view(-1, 1) # Reshape to column vector - col_indices = torch.arange(mask.size(1), device=mask.device) - context_mask = (col_indices <= row_indices + diagonal).bool().expand_as(mask) # Expand to match mask shape - - mask.masked_fill_(context_mask, torch.finfo(dtype).min) - - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -def _gaudi_prepare_4d_causal_attention_mask( - attention_mask: Optional[torch.Tensor], - input_shape: Union[torch.Size, Tuple, List], - inputs_embeds: torch.Tensor, - past_key_values_length: int, - sliding_window: Optional[int] = None, -): - """ - Adapted from: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_attn_mask_utils.py#L278 - - Differences: - - replace `AttentionMaskConverter` by `GaudiAttentionMaskConverter` - """ - attn_mask_converter = GaudiAttentionMaskConverter(is_causal=True, sliding_window=sliding_window) - - key_value_length = input_shape[-1] + past_key_values_length - - # 4d mask is passed through the layers - if attention_mask is not None and len(attention_mask.shape) == 2: - attention_mask = attn_mask_converter.to_4d( - attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype - ) - elif attention_mask is not None and len(attention_mask.shape) == 4: - expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) - if tuple(attention_mask.shape) != expected_shape: - raise ValueError( - f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." - ) - else: - # if the 4D mask has correct shape - invert it and fill with negative infinity - inverted_mask = 1.0 - attention_mask - attention_mask = inverted_mask.masked_fill( - inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min - ) - else: - attention_mask = attn_mask_converter.to_causal_4d( - input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device - ) - - return attention_mask diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py deleted file mode 100644 index 8d67911886..0000000000 --- a/optimum/habana/transformers/modeling_utils.py +++ /dev/null @@ -1,523 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
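For reference, the `_gaudi_prepare_4d_causal_attention_mask` helper above turns the usual 2D padding mask into the 4D additive mask consumed by the attention layers. A hypothetical usage sketch (shapes and values are made up, and the import path is assumed to follow the file shown above):

```python
import torch

from optimum.habana.transformers.modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask

batch_size, seq_len, hidden = 2, 5, 8
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)  # 2D padding mask
attention_mask[1, -2:] = 0                                           # pad the second sequence
inputs_embeds = torch.randn(batch_size, seq_len, hidden)

mask_4d = _gaudi_prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_len),
    inputs_embeds,
    past_key_values_length=0,
)
print(mask_4d.shape)  # (batch_size, 1, seq_len, seq_len); masked positions hold the dtype's min value
```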
- -import transformers -import transformers.utils.fx - -from .generation import ( - GaudiGenerationConfig, - GaudiGenerationMixin, - gaudi_EosTokenCriteria_call, - gaudi_MaxLengthCriteria_call, - gaudi_MaxNewTokensCriteria_call, - gaudi_MaxTimeCriteria_call, - gaudi_StoppingCriteriaList_call, -) -from .models import ( - GaudiBloomForCausalLM, - GaudiBloomMLP, - GaudiCLIPVisionEmbeddings, - GaudiCodeGenAttention, - GaudiCodeGenForCausalLM, - GaudiFalconAttention, - GaudiFalconDecoderLayer, - GaudiFalconForCausalLM, - GaudiFalconMLP, - GaudiFalconModel, - GaudiGemmaDecoderLayer, - GaudiGemmaForCausalLM, - GaudiGPT2Attention, - GaudiGPT2Block, - GaudiGPT2LMHeadModel, - GaudiGPTBigCodeForCausalLM, - GaudiGPTJAttention, - GaudiGPTJBlock, - GaudiGPTJForCausalLM, - GaudiGPTNeoXForCausalLM, - GaudiLlamaAttention, - GaudiLlamaDecoderLayer, - GaudiLlamaDynamicNTKScalingRotaryEmbedding, - GaudiLlamaForCausalLM, - GaudiLlamaLinearScalingRotaryEmbedding, - GaudiLlamaMLP, - GaudiLlamaModel, - GaudiLlamaRotaryEmbedding, - GaudiLlavaForConditionalGeneration, - GaudiLlavaNextForConditionalGeneration, - GaudiMistralAttention, - GaudiMistralDecoderLayer, - GaudiMistralForCausalLM, - GaudiMistralModel, - GaudiMixtralAttention, - GaudiMixtralDecoderLayer, - GaudiMixtralForCausalLM, - GaudiMixtralModel, - GaudiMptForCausalLM, - GaudiMptModel, - GaudiOPTForCausalLM, - GaudiOPTLearnedPositionalEmbedding, - GaudiPersimmonForCausalLM, - GaudiPhiAttention, - GaudiPhiDecoderLayer, - GaudiPhiForCausalLM, - GaudiPhiModel, - GaudiQwen2Attention, - GaudiQwen2DecoderLayer, - GaudiQwen2ForCausalLM, - GaudiQwen2MLP, - GaudiQwen2Model, - GaudiStableLmDecoderLayer, - GaudiStableLmForCausalLM, - GaudiStarcoder2DecoderLayer, - GaudiStarcoder2ForCausalLM, - LlamaConfig, - MistralConfig, - MixtralConfig, - _gaudi_wav2vec2_compute_mask_indices, - _gaudi_wav2vec2_mask_hidden_states, - gaudi_albert_forward, - gaudi_BartAttention_forward, - gaudi_BartDecoder_forward, - gaudi_BartDecoderLayer_forward, - gaudi_BartEncoder_forward, - gaudi_BartEncoderLayer_forward, - gaudi_BartForConditionalGeneration_forward, - gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, - gaudi_BartLearnedPositionalEmbedding, - gaudi_BartModel_forward, - gaudi_BlipForConditionalGeneration_generate, - gaudi_BlipForQuestionAnswering_generate, - gaudi_BlipTextAttention_forward, - gaudi_BlipTextEncoder_forward, - gaudi_BlipTextLayer_forward, - gaudi_BlipTextLMHead_forward, - gaudi_BlipTextLMHead_prepare_inputs_for_generation, - gaudi_BlipTextModel_forward, - gaudi_BlipTextSelfAttention_forward, - gaudi_bloom_attention_forward, - gaudi_bloom_block_forward, - gaudi_bloom_convert_to_bloom_cache, - gaudi_bloom_convert_to_standard_cache, - gaudi_bloom_model_forward, - gaudi_check_and_enable_sdpa, - gaudi_codegen_block_forward, - gaudi_codegen_model_forward, - gaudi_conv1d_forward, - gaudi_esm_for_protein_folding_forward, - gaudi_esmfolding_trunk_forward, - gaudi_falcon_linear_forward, - gaudi_gemma_attention_forward, - gaudi_gemma_model_forward, - gaudi_generate_speech, - gaudi_get_extended_attention_mask, - gaudi_gpt2_forward, - gaudi_gpt_bigcode_attention_forward, - gaudi_gpt_bigcode_block_forward, - gaudi_gpt_bigcode_model_forward, - gaudi_gpt_neox_attention_forward, - gaudi_gpt_neox_layer_forward, - gaudi_gpt_neox_model_forward, - gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, - gaudi_gptj_model_forward, - gaudi_invert_attention_mask, - gaudi_llama_rmsnorm_forward, - gaudi_mistral_rmsnorm_forward, - gaudi_mixtral_block_sparse_moe_forward, - 
gaudi_mixtral_rmsnorm_forward, - gaudi_mpt_attention_forward, - gaudi_mpt_block_forward, - gaudi_opt_attention_forward, - gaudi_opt_decoder_forward, - gaudi_opt_decoder_layer_forward, - gaudi_opt_model_forward, - gaudi_owlvitclasspredictionhead_forward, - gaudi_persimmon_attention_forward, - gaudi_persimmon_decoder_layer_forward, - gaudi_persimmon_model_forward, - gaudi_qwen2_rmsnorm_forward, - gaudi_rot_matmul, - gaudi_rot_vec_mul, - gaudi_SeamlessM4TAttention_forward, - gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths, - gaudi_SeamlessM4TDecoder_forward, - gaudi_SeamlessM4TDecoderLayer_forward, - gaudi_SeamlessM4TForTextToSpeech_forward, - gaudi_SeamlessM4TForTextToSpeech_generate, - gaudi_SeamlessM4TForTextToSpeech_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_forward, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitModel_forward, - gaudi_SpeechT5Attention_forward, - gaudi_SpeechT5Decoder_forward, - gaudi_SpeechT5DecoderLayer_forward, - gaudi_stablelm_attention_forward, - gaudi_stablelm_model_forward, - gaudi_starcoder2_attention_forward, - gaudi_starcoder2_model_forward, - gaudi_swin_get_attn_mask, - gaudi_t5_layernorm_forward, - gaudi_T5Attention_forward, - gaudi_T5Block_forward, - gaudi_T5ForConditionalGeneration_forward, - gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, - gaudi_T5LayerSelfAttention_forward, - gaudi_T5Stack_forward, - gaudi_unconstrained_rational_quadratic_spline, - gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation, - gaudi_vit_self_attention_forward, - gaudi_VitsResidualCouplingLayer_forward, - gaudi_wav2vec2_encoder_forward, - gaudi_wav2vec2_forward, - gaudi_wav2vec2_tdnnlayer_forward, - gaudi_wav2vec2forctc_forward, -) - - -def adapt_transformers_to_gaudi(): - """ - Replaces some Transformers' methods for equivalent methods optimized - for Gaudi. 
- """ - - # models that support symbolic tracing should be added to this list - models_with_tracing_support = [] - - # optimize Conv1D - transformers.pytorch_utils.Conv1D.forward = gaudi_conv1d_forward - - # Optimization tweak for ViT - transformers.models.vit.modeling_vit.ViTSelfAttention.forward = gaudi_vit_self_attention_forward - - # Optimization tweak for Swin - transformers.models.swin.modeling_swin.SwinLayer.get_attn_mask = gaudi_swin_get_attn_mask - - # Optimization tweak for Wav2Vec2 - transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices = _gaudi_wav2vec2_compute_mask_indices - # transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices = _gaudi_wav2vec2_sample_negative_indices - transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states = ( - _gaudi_wav2vec2_mask_hidden_states - ) - transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward = gaudi_wav2vec2_forward - transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder.forward = gaudi_wav2vec2_encoder_forward - transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward = gaudi_wav2vec2forctc_forward - transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer.forward = gaudi_wav2vec2_tdnnlayer_forward - - # Generation is modified to run faster in lazy mode - transformers.generation.GenerationMixin.generate = GaudiGenerationMixin.generate - transformers.generation.GenerationMixin._update_model_kwargs_for_generation = ( - GaudiGenerationMixin._update_model_kwargs_for_generation - ) - transformers.generation.GenerationMixin.update_model_kwargs_for_bucketing = ( - GaudiGenerationMixin.update_model_kwargs_for_bucketing - ) - transformers.generation.GenerationMixin._get_hpu_graphs_kwargs = GaudiGenerationMixin._get_hpu_graphs_kwargs - transformers.generation.GenerationMixin._pad_past_key_values = GaudiGenerationMixin._pad_past_key_values - transformers.generation.GenerationMixin._remove_past_key_values = GaudiGenerationMixin._remove_past_key_values - transformers.generation.GenerationMixin._expand_inputs_for_generation = staticmethod( - GaudiGenerationMixin._expand_inputs_for_generation - ) - transformers.generation.GenerationMixin._prepare_attention_mask_for_generation = ( - GaudiGenerationMixin._prepare_attention_mask_for_generation - ) - transformers.generation.GenerationMixin._prepare_decoder_input_ids_for_generation = ( - GaudiGenerationMixin._prepare_decoder_input_ids_for_generation - ) - transformers.generation.GenerationMixin._prepare_decoder_attention_mask = ( - GaudiGenerationMixin._prepare_decoder_attention_mask - ) - transformers.generation.GenerationMixin._prepare_generation_config = ( - GaudiGenerationMixin._prepare_generation_config - ) - transformers.generation.GenerationMixin._prepare_generated_length = GaudiGenerationMixin._prepare_generated_length - transformers.generation.GenerationMixin._get_stopping_criteria = GaudiGenerationMixin._get_stopping_criteria - transformers.generation.GenerationMixin._validate_model_kwargs = GaudiGenerationMixin._validate_model_kwargs - transformers.generation.GenerationMixin._greedy_search = GaudiGenerationMixin._greedy_search - transformers.generation.GenerationMixin._sample = GaudiGenerationMixin._sample - transformers.generation.GenerationMixin._beam_search = GaudiGenerationMixin._beam_search - transformers.generation.GenerationMixin._beam_sample = GaudiGenerationMixin._beam_sample - transformers.generation.GenerationMixin._group_beam_search = GaudiGenerationMixin._group_beam_search - 
transformers.generation.GenerationMixin._constrained_beam_search = GaudiGenerationMixin._constrained_beam_search - transformers.generation.GenerationMixin._assisted_decoding = GaudiGenerationMixin._assisted_decoding - transformers.generation.GenerationMixin._get_candidate_generator = GaudiGenerationMixin._get_candidate_generator - transformers.generation.GenerationConfig = GaudiGenerationConfig - transformers.modeling_utils.GenerationConfig = GaudiGenerationConfig - transformers.generation.MaxLengthCriteria.__call__ = gaudi_MaxLengthCriteria_call - transformers.generation.MaxNewTokensCriteria.__call__ = gaudi_MaxNewTokensCriteria_call - transformers.generation.MaxTimeCriteria.__call__ = gaudi_MaxTimeCriteria_call - transformers.generation.EosTokenCriteria.__call__ = gaudi_EosTokenCriteria_call - transformers.generation.StoppingCriteriaList.__call__ = gaudi_StoppingCriteriaList_call - - # Optimization for BLOOM generation on Gaudi - transformers.models.bloom.modeling_bloom.BloomAttention.forward = gaudi_bloom_attention_forward - transformers.models.bloom.modeling_bloom.BloomBlock.forward = gaudi_bloom_block_forward - transformers.models.bloom.modeling_bloom.BloomModel.forward = gaudi_bloom_model_forward - transformers.models.bloom.modeling_bloom.BloomMLP = GaudiBloomMLP - transformers.models.bloom.modeling_bloom.BloomForCausalLM = GaudiBloomForCausalLM - transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._convert_to_standard_cache = ( - gaudi_bloom_convert_to_standard_cache - ) - transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._convert_to_bloom_cache = ( - gaudi_bloom_convert_to_bloom_cache - ) - - # Optimization for BART generation on Gaudi - transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding = gaudi_BartLearnedPositionalEmbedding - transformers.models.bart.modeling_bart.BartAttention.forward = gaudi_BartAttention_forward - transformers.models.bart.modeling_bart.BartEncoderLayer.forward = gaudi_BartEncoderLayer_forward - transformers.models.bart.modeling_bart.BartDecoderLayer.forward = gaudi_BartDecoderLayer_forward - transformers.models.bart.modeling_bart.BartEncoder.forward = gaudi_BartEncoder_forward - transformers.models.bart.modeling_bart.BartDecoder.forward = gaudi_BartDecoder_forward - transformers.models.bart.modeling_bart.BartModel.forward = gaudi_BartModel_forward - transformers.models.bart.modeling_bart.BartForConditionalGeneration.forward = ( - gaudi_BartForConditionalGeneration_forward - ) - transformers.models.bart.modeling_bart.BartForConditionalGeneration.prepare_inputs_for_generation = ( - gaudi_BartForConditionalGeneration_prepare_inputs_for_generation - ) - - # Optimization for codegen generation on Gaudi - transformers.models.codegen.modeling_codegen.CodeGenAttention = GaudiCodeGenAttention - transformers.models.codegen.modeling_codegen.CodeGenForCausalLM = GaudiCodeGenForCausalLM - transformers.models.codegen.modeling_codegen.CodeGenModel.forward = gaudi_codegen_model_forward - transformers.models.codegen.modeling_codegen.CodeGenBlock.forward = gaudi_codegen_block_forward - - # Replace invert_attention_mask and get_extended_attention_mask - # so that Torch Autocast is disabled for specific parts of the code - transformers.modeling_utils.ModuleUtilsMixin.invert_attention_mask = gaudi_invert_attention_mask - transformers.modeling_utils.ModuleUtilsMixin.get_extended_attention_mask = gaudi_get_extended_attention_mask - - # Override sdpa check on Gaudi - transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa = 
gaudi_check_and_enable_sdpa - - # AlbertModel.forward does not rely on get_extended_attention_mask so it also needs to be replaced - transformers.models.albert.modeling_albert.AlbertModel.forward = gaudi_albert_forward - - # Optimization for GPT2 on Gaudi - transformers.models.gpt2.modeling_gpt2.GPT2Attention = GaudiGPT2Attention - transformers.models.gpt2.modeling_gpt2.GPT2Model.forward = gaudi_gpt2_forward - transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel = GaudiGPT2LMHeadModel - transformers.models.gpt2.modeling_gpt2.GPT2Block = GaudiGPT2Block - models_with_tracing_support.extend((GaudiGPT2Attention, GaudiGPT2LMHeadModel)) - - # Optimization for EsmFold on Gaudi - transformers.models.esm.modeling_esmfold.EsmFoldingTrunk.forward = gaudi_esmfolding_trunk_forward - transformers.models.esm.modeling_esmfold.EsmForProteinFolding.forward = gaudi_esm_for_protein_folding_forward - transformers.models.esm.openfold_utils.rigid_utils.rot_matmul = gaudi_rot_matmul - transformers.models.esm.openfold_utils.rigid_utils.rot_vec_mul = gaudi_rot_vec_mul - - # Optimization for OPT generation on Gaudi - transformers.models.opt.modeling_opt.OPTAttention.forward = gaudi_opt_attention_forward - transformers.models.opt.modeling_opt.OPTDecoder.forward = gaudi_opt_decoder_forward - transformers.models.opt.modeling_opt.OPTForCausalLM = GaudiOPTForCausalLM - transformers.models.opt.modeling_opt.OPTModel.forward = gaudi_opt_model_forward - transformers.models.opt.modeling_opt.OPTDecoderLayer.forward = gaudi_opt_decoder_layer_forward - transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding = GaudiOPTLearnedPositionalEmbedding - - # Optimization for GPTJ on Gaudi - transformers.models.gptj.modeling_gptj.GPTJAttention = GaudiGPTJAttention - transformers.models.gptj.modeling_gptj.GPTJForCausalLM = GaudiGPTJForCausalLM - transformers.models.gptj.modeling_gptj.GPTJBlock = GaudiGPTJBlock - transformers.models.gptj.modeling_gptj.GPTJModel.forward = gaudi_gptj_model_forward - - # Optimization for GPTBigCode on Gaudi - transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeAttention.forward = ( - gaudi_gpt_bigcode_attention_forward - ) - transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM = GaudiGPTBigCodeForCausalLM - transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeBlock.forward = gaudi_gpt_bigcode_block_forward - transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeModel.forward = gaudi_gpt_bigcode_model_forward - - # Optimization for gpt-neox generation on Gaudi - transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM = GaudiGPTNeoXForCausalLM - transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXModel.forward = gaudi_gpt_neox_model_forward - transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXLayer.forward = gaudi_gpt_neox_layer_forward - transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention.forward = gaudi_gpt_neox_attention_forward - transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding._set_cos_sin_cache = ( - gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache - ) - - # Optimization for llama generation on Gaudi - transformers.models.llama.modeling_llama.LlamaForCausalLM = GaudiLlamaForCausalLM - transformers.models.llama.modeling_llama.LlamaModel = GaudiLlamaModel - transformers.models.llama.modeling_llama.LlamaAttention = GaudiLlamaAttention - transformers.models.llama.modeling_llama.LlamaMLP = GaudiLlamaMLP - transformers.models.llama.modeling_llama.LlamaDecoderLayer = GaudiLlamaDecoderLayer - 
transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = GaudiLlamaRotaryEmbedding - transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding = GaudiLlamaLinearScalingRotaryEmbedding - transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding = ( - GaudiLlamaDynamicNTKScalingRotaryEmbedding - ) - transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward - transformers.models.llama.configuration_llama.LlamaConfig = LlamaConfig - - # Optimization for llava on Gaudi - transformers.models.llava.modeling_llava.LlavaForConditionalGeneration = GaudiLlavaForConditionalGeneration - transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration = ( - GaudiLlavaNextForConditionalGeneration - ) - - # Optimization for Clip on Gaudi - transformers.models.clip.modeling_clip.CLIPVisionEmbeddings = GaudiCLIPVisionEmbeddings - - # Optimization for falcon generation on Gaudi - transformers.models.falcon.modeling_falcon.FalconAttention = GaudiFalconAttention - transformers.models.falcon.modeling_falcon.FalconForCausalLM = GaudiFalconForCausalLM - transformers.models.falcon.modeling_falcon.FalconMLP = GaudiFalconMLP - transformers.models.falcon.modeling_falcon.FalconModel = GaudiFalconModel - transformers.models.falcon.modeling_falcon.FalconDecoderLayer = GaudiFalconDecoderLayer - transformers.models.falcon.modeling_falcon.FalconLinear.forward = gaudi_falcon_linear_forward - - # Optimization for t5 on Gaudi - transformers.models.t5.modeling_t5.T5LayerNorm.forward = gaudi_t5_layernorm_forward - transformers.models.t5.modeling_t5.T5Stack.forward = gaudi_T5Stack_forward - transformers.models.t5.modeling_t5.T5LayerSelfAttention.forward = gaudi_T5LayerSelfAttention_forward - transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward = gaudi_T5ForConditionalGeneration_forward - transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation = ( - gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation - ) - transformers.models.t5.modeling_t5.T5Attention.forward = gaudi_T5Attention_forward - transformers.models.t5.modeling_t5.T5Block.forward = gaudi_T5Block_forward - - # Optimization for mpt on Gaudi - transformers.models.mpt.modeling_mpt.MptForCausalLM = GaudiMptForCausalLM - transformers.models.mpt.modeling_mpt.MptModel = GaudiMptModel - transformers.models.mpt.modeling_mpt.MptAttention.forward = gaudi_mpt_attention_forward - transformers.models.mpt.modeling_mpt.MptBlock.forward = gaudi_mpt_block_forward - - # Optimization for mistral on Gaudi - transformers.models.mistral.modeling_mistral.MistralForCausalLM = GaudiMistralForCausalLM - transformers.models.mistral.modeling_mistral.MistralAttention = GaudiMistralAttention - transformers.models.mistral.modeling_mistral.MistralDecoderLayer = GaudiMistralDecoderLayer - transformers.models.mistral.modeling_mistral.MistralModel = GaudiMistralModel - transformers.models.mistral.modeling_mistral.MistralRMSNorm.forward = gaudi_mistral_rmsnorm_forward - transformers.models.mistral.configuration_mistral.MistralConfig = MistralConfig - - # Optimization for phi on Gaudi - transformers.models.phi.modeling_phi.PhiForCausalLM = GaudiPhiForCausalLM - transformers.models.phi.modeling_phi.PhiAttention = GaudiPhiAttention - transformers.models.phi.modeling_phi.PhiDecoderLayer = GaudiPhiDecoderLayer - transformers.models.phi.modeling_phi.PhiModel = GaudiPhiModel - - # Optimization for gemma on Gaudi - 
transformers.models.gemma.modeling_gemma.GemmaForCausalLM = GaudiGemmaForCausalLM - transformers.models.gemma.modeling_gemma.GemmaAttention.forward = gaudi_gemma_attention_forward - transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = GaudiGemmaDecoderLayer - transformers.models.gemma.modeling_gemma.GemmaModel.forward = gaudi_gemma_model_forward - - # Optimization for blip Text model on Gaudi - transformers.models.blip.BlipTextModel.forward = gaudi_BlipTextModel_forward - transformers.models.blip.modeling_blip_text.BlipTextLMHeadModel.forward = gaudi_BlipTextLMHead_forward - transformers.models.blip.modeling_blip_text.BlipTextLMHeadModel.prepare_inputs_for_generation = ( - gaudi_BlipTextLMHead_prepare_inputs_for_generation - ) - transformers.models.blip.modeling_blip_text.BlipTextEncoder.forward = gaudi_BlipTextEncoder_forward - transformers.models.blip.modeling_blip_text.BlipTextLayer.forward = gaudi_BlipTextLayer_forward - transformers.models.blip.modeling_blip_text.BlipTextAttention.forward = gaudi_BlipTextAttention_forward - transformers.models.blip.modeling_blip_text.BlipTextSelfAttention.forward = gaudi_BlipTextSelfAttention_forward - transformers.models.blip.BlipForQuestionAnswering.generate = gaudi_BlipForQuestionAnswering_generate - transformers.models.blip.BlipForConditionalGeneration.generate = gaudi_BlipForConditionalGeneration_generate - - # Optimization for mixtral on Gaudi - transformers.models.mixtral.modeling_mixtral.MixtralAttention = GaudiMixtralAttention - transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM = GaudiMixtralForCausalLM - transformers.models.mixtral.modeling_mixtral.MixtralModel = GaudiMixtralModel - transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock.forward = gaudi_mixtral_block_sparse_moe_forward - transformers.models.mixtral.modeling_mixtral.MixtralDecoderLayer = GaudiMixtralDecoderLayer - transformers.models.mixtral.modeling_mixtral.MixtralRMSNorm.forward = gaudi_mixtral_rmsnorm_forward - transformers.models.mixtral.configuration_mixtral.MixtralConfig = MixtralConfig - - # Optimization for speecht5 on Gaudi - transformers.models.speecht5.modeling_speecht5.SpeechT5Decoder.forward = gaudi_SpeechT5Decoder_forward - transformers.models.speecht5.modeling_speecht5.SpeechT5DecoderLayer.forward = gaudi_SpeechT5DecoderLayer_forward - transformers.models.speecht5.modeling_speecht5.SpeechT5Attention.forward = gaudi_SpeechT5Attention_forward - transformers.models.speecht5.modeling_speecht5._generate_speech = gaudi_generate_speech - - # Optimization for persimmon on Gaudi - transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM = GaudiPersimmonForCausalLM - transformers.models.persimmon.modeling_persimmon.PersimmonModel.forward = gaudi_persimmon_model_forward - transformers.models.persimmon.modeling_persimmon.PersimmonAttention.forward = gaudi_persimmon_attention_forward - transformers.models.persimmon.modeling_persimmon.PersimmonDecoderLayer.forward = ( - gaudi_persimmon_decoder_layer_forward - ) - - # Optimization for seamless m4t on Gaudi - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TAttention.forward = ( - gaudi_SeamlessM4TAttention_forward - ) - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoderLayer.forward = ( - gaudi_SeamlessM4TDecoderLayer_forward - ) - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoder.forward = ( - gaudi_SeamlessM4TDecoder_forward - ) - 
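One practical caveat of class-level replacement such as MixtralForCausalLM = GaudiMixtralForCausalLM above (an assumption about usage, not something stated in this diff): only attribute lookups performed after the assignment see the new class, so an alias imported beforehand keeps pointing at the stock implementation. A small illustration with hypothetical names:

import types

# Stand-in for a module namespace such as transformers.models.mixtral.modeling_mixtral.
fake_module = types.SimpleNamespace()


class StockModel:
    label = "stock"


class OptimizedModel:
    label = "optimized"


fake_module.Model = StockModel
early_alias = fake_module.Model      # captured before the patch runs

fake_module.Model = OptimizedModel   # the patch, analogous to the class swaps above

print(early_alias.label)             # "stock": aliases bound earlier are unaffected
print(fake_module.Model.label)       # "optimized": later lookups see the replacement

This is presumably why the patching is applied eagerly, before the model classes are resolved anywhere else.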
transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitModel.forward = ( - gaudi_SeamlessM4TTextToUnitModel_forward - ) - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.forward = ( - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_forward - ) - - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.prepare_inputs_for_generation = gaudi_SeamlessM4TTextToUnitForConditionalGeneration_prepare_inputs_for_generation - - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_output_hifigan_lengths = ( - gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths - ) - - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.forward = ( - gaudi_SeamlessM4TForTextToSpeech_forward - ) - - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.generate = ( - gaudi_SeamlessM4TForTextToSpeech_generate - ) - - transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.prepare_inputs_for_generation = ( - gaudi_SeamlessM4TForTextToSpeech_prepare_inputs_for_generation - ) - - transformers.models.vits.modeling_vits._unconstrained_rational_quadratic_spline = ( - gaudi_unconstrained_rational_quadratic_spline - ) - transformers.models.vits.modeling_vits.VitsResidualCouplingLayer.forward = gaudi_VitsResidualCouplingLayer_forward - - # Optimization for starcoder2 on Gaudi - transformers.models.starcoder2.modeling_starcoder2.Starcoder2ForCausalLM = GaudiStarcoder2ForCausalLM - transformers.models.starcoder2.modeling_starcoder2.Starcoder2Model.forward = gaudi_starcoder2_model_forward - transformers.models.starcoder2.modeling_starcoder2.Starcoder2Attention.forward = gaudi_starcoder2_attention_forward - transformers.models.starcoder2.modeling_starcoder2.Starcoder2DecoderLayer = GaudiStarcoder2DecoderLayer - - # Optimization for qwen2 on Gaudi - transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = GaudiQwen2ForCausalLM - transformers.models.qwen2.modeling_qwen2.Qwen2Model = GaudiQwen2Model - transformers.models.qwen2.modeling_qwen2.Qwen2Attention = GaudiQwen2Attention - transformers.models.qwen2.modeling_qwen2.Qwen2MLP = GaudiQwen2MLP - transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GaudiQwen2DecoderLayer - transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm.forward = gaudi_qwen2_rmsnorm_forward - - # Optimization for stablelm on Gaudi - transformers.models.stablelm.modeling_stablelm.StableLmForCausalLM = GaudiStableLmForCausalLM - transformers.models.stablelm.modeling_stablelm.StableLmModel.forward = gaudi_stablelm_model_forward - transformers.models.stablelm.modeling_stablelm.StableLmAttention.forward = gaudi_stablelm_attention_forward - transformers.models.stablelm.modeling_stablelm.StableLmDecoderLayer = GaudiStableLmDecoderLayer - - transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder.VisionEncoderDecoderModel.prepare_inputs_for_generation = gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation - - # Optimization for Owl ViT model on Gaudi - transformers.models.owlvit.modeling_owlvit.OwlViTClassPredictionHead.forward = ( - gaudi_owlvitclasspredictionhead_forward - ) - - # Tell transformers which Gaudi models support tracing - transformers.utils.fx._SUPPORTED_MODELS += tuple(cls.__name__ for cls in models_with_tracing_support) diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py 
deleted file mode 100644 index f17018e317..0000000000 --- a/optimum/habana/transformers/models/__init__.py +++ /dev/null @@ -1,208 +0,0 @@ -from .albert import gaudi_albert_forward -from .bart import ( - gaudi_BartAttention_forward, - gaudi_BartDecoder_forward, - gaudi_BartDecoderLayer_forward, - gaudi_BartEncoder_forward, - gaudi_BartEncoderLayer_forward, - gaudi_BartForConditionalGeneration_forward, - gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, - gaudi_BartLearnedPositionalEmbedding, - gaudi_BartModel_forward, -) -from .blip import ( - gaudi_BlipForConditionalGeneration_generate, - gaudi_BlipForQuestionAnswering_generate, - gaudi_BlipTextAttention_forward, - gaudi_BlipTextEncoder_forward, - gaudi_BlipTextLayer_forward, - gaudi_BlipTextLMHead_forward, - gaudi_BlipTextLMHead_prepare_inputs_for_generation, - gaudi_BlipTextModel_forward, - gaudi_BlipTextSelfAttention_forward, -) -from .bloom import ( - GaudiBloomForCausalLM, - GaudiBloomMLP, - gaudi_bloom_attention_forward, - gaudi_bloom_block_forward, - gaudi_bloom_convert_to_bloom_cache, - gaudi_bloom_convert_to_standard_cache, - gaudi_bloom_model_forward, -) -from .clip import GaudiCLIPVisionEmbeddings -from .codegen import ( - GaudiCodeGenAttention, - GaudiCodeGenForCausalLM, - gaudi_codegen_block_forward, - gaudi_codegen_model_forward, -) -from .esm import ( - gaudi_esm_for_protein_folding_forward, - gaudi_esmfolding_trunk_forward, - gaudi_rot_matmul, - gaudi_rot_vec_mul, -) -from .falcon import ( - GaudiFalconAttention, - GaudiFalconDecoderLayer, - GaudiFalconForCausalLM, - GaudiFalconMLP, - GaudiFalconModel, - gaudi_falcon_linear_forward, -) -from .gemma import ( - GaudiGemmaDecoderLayer, - GaudiGemmaForCausalLM, - gaudi_gemma_attention_forward, - gaudi_gemma_model_forward, -) -from .gpt2 import GaudiGPT2Attention, GaudiGPT2Block, GaudiGPT2LMHeadModel, gaudi_gpt2_forward -from .gpt_bigcode import ( - GaudiGPTBigCodeForCausalLM, - gaudi_gpt_bigcode_attention_forward, - gaudi_gpt_bigcode_block_forward, - gaudi_gpt_bigcode_model_forward, -) -from .gpt_neox import ( - GaudiGPTNeoXForCausalLM, - gaudi_gpt_neox_attention_forward, - gaudi_gpt_neox_layer_forward, - gaudi_gpt_neox_model_forward, - gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, -) -from .gptj import ( - GaudiGPTJAttention, - GaudiGPTJBlock, - GaudiGPTJForCausalLM, - gaudi_gptj_model_forward, -) -from .llama import ( - GaudiLlamaAttention, - GaudiLlamaDecoderLayer, - GaudiLlamaDynamicNTKScalingRotaryEmbedding, - GaudiLlamaForCausalLM, - GaudiLlamaLinearScalingRotaryEmbedding, - GaudiLlamaMLP, - GaudiLlamaModel, - GaudiLlamaRotaryEmbedding, - LlamaConfig, - gaudi_llama_rmsnorm_forward, -) -from .llava import GaudiLlavaForConditionalGeneration -from .llava_next import GaudiLlavaNextForConditionalGeneration -from .mistral import ( - GaudiMistralAttention, - GaudiMistralDecoderLayer, - GaudiMistralForCausalLM, - GaudiMistralModel, - MistralConfig, - gaudi_mistral_rmsnorm_forward, -) -from .mixtral import ( - GaudiMixtralAttention, - GaudiMixtralDecoderLayer, - GaudiMixtralForCausalLM, - GaudiMixtralModel, - MixtralConfig, - gaudi_mixtral_block_sparse_moe_forward, - gaudi_mixtral_rmsnorm_forward, -) -from .modeling_all_models import ( - gaudi_check_and_enable_sdpa, - gaudi_conv1d_forward, - gaudi_get_extended_attention_mask, - gaudi_invert_attention_mask, -) -from .mpt import ( - GaudiMptForCausalLM, - GaudiMptModel, - gaudi_mpt_attention_forward, - gaudi_mpt_block_forward, -) -from .opt import ( - GaudiOPTForCausalLM, - GaudiOPTLearnedPositionalEmbedding, - 
gaudi_opt_attention_forward, - gaudi_opt_decoder_forward, - gaudi_opt_decoder_layer_forward, - gaudi_opt_model_forward, -) -from .owlvit import gaudi_owlvitclasspredictionhead_forward -from .persimmon import ( - GaudiPersimmonForCausalLM, - gaudi_persimmon_attention_forward, - gaudi_persimmon_decoder_layer_forward, - gaudi_persimmon_model_forward, -) -from .phi import ( - GaudiPhiAttention, - GaudiPhiDecoderLayer, - GaudiPhiForCausalLM, - GaudiPhiModel, -) -from .qwen2 import ( - GaudiQwen2Attention, - GaudiQwen2DecoderLayer, - GaudiQwen2ForCausalLM, - GaudiQwen2MLP, - GaudiQwen2Model, - gaudi_qwen2_rmsnorm_forward, -) -from .seamless_m4t import ( - gaudi_SeamlessM4TAttention_forward, - gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths, - gaudi_SeamlessM4TDecoder_forward, - gaudi_SeamlessM4TDecoderLayer_forward, - gaudi_SeamlessM4TForTextToSpeech_forward, - gaudi_SeamlessM4TForTextToSpeech_generate, - gaudi_SeamlessM4TForTextToSpeech_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_forward, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitModel_forward, -) -from .speecht5 import ( - gaudi_generate_speech, - gaudi_SpeechT5Attention_forward, - gaudi_SpeechT5Decoder_forward, - gaudi_SpeechT5DecoderLayer_forward, -) -from .stablelm import ( - GaudiStableLmDecoderLayer, - GaudiStableLmForCausalLM, - gaudi_stablelm_attention_forward, - gaudi_stablelm_model_forward, -) -from .starcoder2 import ( - GaudiStarcoder2DecoderLayer, - GaudiStarcoder2ForCausalLM, - gaudi_starcoder2_attention_forward, - gaudi_starcoder2_model_forward, -) -from .swin import gaudi_swin_get_attn_mask -from .t5 import ( - gaudi_t5_layernorm_forward, - gaudi_T5Attention_forward, - gaudi_T5Block_forward, - gaudi_T5ForConditionalGeneration_forward, - gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, - gaudi_T5LayerSelfAttention_forward, - gaudi_T5Stack_forward, -) -from .vision_encoder_decoder import ( - gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation, -) -from .vit import gaudi_vit_self_attention_forward -from .vits import ( - gaudi_unconstrained_rational_quadratic_spline, - gaudi_VitsResidualCouplingLayer_forward, -) -from .wav2vec2 import ( - _gaudi_wav2vec2_compute_mask_indices, - _gaudi_wav2vec2_mask_hidden_states, - _gaudi_wav2vec2_sample_negative_indices, - gaudi_wav2vec2_encoder_forward, - gaudi_wav2vec2_forward, - gaudi_wav2vec2_tdnnlayer_forward, - gaudi_wav2vec2forctc_forward, -) diff --git a/optimum/habana/transformers/models/albert/__init__.py b/optimum/habana/transformers/models/albert/__init__.py deleted file mode 100644 index fccea02a85..0000000000 --- a/optimum/habana/transformers/models/albert/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_albert import gaudi_albert_forward diff --git a/optimum/habana/transformers/models/albert/modeling_albert.py b/optimum/habana/transformers/models/albert/modeling_albert.py deleted file mode 100644 index 6ac9b80073..0000000000 --- a/optimum/habana/transformers/models/albert/modeling_albert.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -import torch -from transformers.modeling_outputs import BaseModelOutputWithPooling - - -def gaudi_albert_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[BaseModelOutputWithPooling, Tuple]: - """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/models/albert/modeling_albert.py#L689 - except that mixed precision is disabled for computing: - extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - # torch.finfo must take the dtype of encoder_extended_attention_mask - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # bf16 compatibility - extended_attention_mask = 1.0 - extended_attention_mask - with torch.autocast(enabled=False, device_type="hpu"): - extended_attention_mask = extended_attention_mask * torch.finfo(extended_attention_mask.dtype).min - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds - ) - encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - head_mask=head_mask, - 
output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = encoder_outputs[0] - - pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) diff --git a/optimum/habana/transformers/models/bart/__init__.py b/optimum/habana/transformers/models/bart/__init__.py deleted file mode 100644 index c8148194d8..0000000000 --- a/optimum/habana/transformers/models/bart/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .modeling_bart import ( - gaudi_BartAttention_forward, - gaudi_BartDecoder_forward, - gaudi_BartDecoderLayer_forward, - gaudi_BartEncoder_forward, - gaudi_BartEncoderLayer_forward, - gaudi_BartForConditionalGeneration_forward, - gaudi_BartForConditionalGeneration_prepare_inputs_for_generation, - gaudi_BartLearnedPositionalEmbedding, - gaudi_BartModel_forward, -) diff --git a/optimum/habana/transformers/models/bart/modeling_bart.py b/optimum/habana/transformers/models/bart/modeling_bart.py deleted file mode 100644 index f68b1e3f81..0000000000 --- a/optimum/habana/transformers/models/bart/modeling_bart.py +++ /dev/null @@ -1,801 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BART model.""" - -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_attn_mask_utils import ( - _prepare_4d_attention_mask, - _prepare_4d_attention_mask_for_sdpa, - _prepare_4d_causal_attention_mask_for_sdpa, -) -from transformers.modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, - Seq2SeqModelOutput, -) -from transformers.models.bart.modeling_bart import shift_tokens_right -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -# Copied from modeling_bart.py: https://raw.githubusercontent.com/huggingface/transformers/648d0deb1dd28a5d9956e63d8cf8c18f96a6a2aa/src/transformers/models/bart/modeling_bart.py -# The difference is: modified dynamic shapes to static shapes with `mark_step` for performance improvement. - - -class gaudi_BartLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int): - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. 
Other models don't have this hack - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward(self, input_ids: torch.Tensor, past_key_values_length: torch.Tensor = torch.tensor(0)): - """`input_ids' shape is expected to be [bsz x seqlen].""" - - bsz, seq_len = input_ids.shape[:2] - positions = torch.arange(0, seq_len, dtype=torch.long, device=self.weight.device).expand(bsz, -1) - positions += past_key_values_length - - return super().forward(positions + self.offset) - - -def gaudi_BartAttention_forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == key_value_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - if is_cross_attention and past_key_value is not None and past_key_value[0].shape[2] == key_value_states.shape[1]: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - present_key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - present_value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - if token_idx is not None: - # HPU bug WA - key_states, value_states = past_key_value - key_states.index_add_( - 2, token_idx - 1, present_key_states - torch.index_select(key_states, 2, token_idx - 1) - ) - value_states.index_add_( - 2, token_idx - 1, present_value_states - torch.index_select(value_states, 2, token_idx - 1) - ) - else: - key_states = torch.cat([past_key_value[0], present_key_states], dim=2) - value_states = torch.cat([past_key_value[1], present_value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. 
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -def gaudi_BartEncoderLayer_forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: - residual = hidden_states - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -def gaudi_BartDecoderLayer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - residual = hidden_states - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - if use_cache: - cross_attn_past_key_value = ( - past_key_value[-2:] if (past_key_value is not None and past_key_value[-2:] != (None, None)) else None - ) - else: - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - 
attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_BartEncoder_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input = input_ids - input_ids = input_ids.view(-1, input_ids.shape[-1]) - elif inputs_embeds is not None: - input = inputs_embeds[:, :, -1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - embed_pos = self.embed_positions(input) - import habana_frameworks.torch.core as htcore - - htcore.mark_step() - embed_pos = embed_pos.to(inputs_embeds.device) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # expand attention_mask - if attention_mask is not None: - if self._use_sdpa and head_mask is None and not output_attentions: - # output_attentions=True & head_mask can not be supported when using SDPA, fall back to - # the manual implementation that requires a 4D causal mask in all cases. 
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) - else: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - # check if head_mask has a correct number of layers specified if desired - if head_mask is not None: - if head_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The head_mask should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." - ) - - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - to_drop = False - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: # skip the layer - to_drop = True - - if to_drop: - layer_outputs = (None, None) - else: - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - (head_mask[idx] if head_mask is not None else None), - output_attentions, - None, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - output_attentions=output_attentions, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions) - - -def gaudi_BartDecoder_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input = input_ids - input_shape = input.shape - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = 
inputs_embeds.size()[:-1] - input = inputs_embeds[:, :, -1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - tensor_past_key_values_length = token_idx - 1 if use_cache else torch.tensor(past_key_values_length) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input) * self.embed_scale - - if self._use_sdpa and not output_attentions and cross_attn_head_mask is None: - # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - input_shape, - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - if self._use_sdpa and cross_attn_head_mask is None and not output_attentions: - # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( - encoder_attention_mask, - inputs_embeds.dtype, - tgt_len=input_shape[-1], - ) - else: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask( - encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ) - - # embed positions - import habana_frameworks.torch.core as htcore - - htcore.mark_step() - positions = self.embed_positions(input, tensor_past_key_values_length) - - htcore.mark_step() - positions = positions.to(inputs_embeds.device) - - hidden_states = inputs_embeds + positions - hidden_states = self.layernorm_embedding(hidden_states) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." 
- ) - - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - head_mask[idx] if head_mask is not None else None, - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, - None, - output_attentions, - use_cache, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -def gaudi_BartModel_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[List[torch.FloatTensor]] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, Bart automatically creates decoder_input_ids from - # input_ids if no decoder_input_ids are provided - if decoder_input_ids is None and decoder_inputs_embeds is None: - if input_ids is None: - raise ValueError( - "If no `decoder_input_ids` or `decoder_inputs_embeds` are " - "passed, `input_ids` cannot be `None`. Please pass either " - "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." 
- ) - - decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id, self.config.decoder_start_token_id) - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -def gaudi_BartForConditionalGeneration_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - decoder_head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[List[torch.FloatTensor]] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, Seq2SeqLMOutput]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if labels is not None: - if use_cache: - logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") - use_cache = False - 
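The token_idx argument threaded through the BART forwards above serves the static-shape goal stated in the header comment of this file: rather than concatenating a new key/value slice onto the cache at every decoding step, gaudi_BartAttention_forward writes the slice in place at position token_idx - 1 via index_add_, so the cache tensor never changes shape. A standalone sketch of that update with toy shapes (the tensor names here are illustrative, not the module's internals):

import torch

bsz, num_heads, max_len, head_dim = 1, 2, 8, 4

# Pre-allocated key cache whose shape stays fixed for the whole generation.
key_cache = torch.zeros(bsz, num_heads, max_len, head_dim)

token_idx = torch.tensor([3])                       # 1-based index of the token being decoded
new_key = torch.randn(bsz, num_heads, 1, head_dim)  # projection of the current token

# In-place, shape-stable equivalent of key_cache[:, :, token_idx - 1] = new_key:
# add the delta between the new slice and whatever the cache currently holds.
key_cache.index_add_(
    2, token_idx - 1, new_key - torch.index_select(key_cache, 2, token_idx - 1)
)

assert torch.allclose(torch.index_select(key_cache, 2, token_idx - 1), new_key)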
if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.pad_token_id, self.config.decoder_start_token_id - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - encoder_outputs=encoder_outputs, - decoder_attention_mask=decoder_attention_mask, - head_mask=head_mask, - decoder_head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - lm_logits = self.lm_head(outputs[0]) - lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) - - masked_lm_loss = None - if labels is not None: - labels = labels.to(lm_logits.device) - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -def gaudi_BartForConditionalGeneration_prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - token_idx=None, - **kwargs, -): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - if token_idx is not None: - decoder_input_ids = torch.index_select(decoder_input_ids, 1, token_idx - 1) - else: - decoder_input_ids = decoder_input_ids[:, -1:].unsqueeze(-1) - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - "token_idx": token_idx, - } diff --git a/optimum/habana/transformers/models/blip/__init__.py b/optimum/habana/transformers/models/blip/__init__.py deleted file mode 100644 index 6f105b11a2..0000000000 --- a/optimum/habana/transformers/models/blip/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .modeling_blip import gaudi_BlipForConditionalGeneration_generate, gaudi_BlipForQuestionAnswering_generate -from .modeling_blip_text import ( - gaudi_BlipTextAttention_forward, - gaudi_BlipTextEncoder_forward, - gaudi_BlipTextLayer_forward, - gaudi_BlipTextLMHead_forward, - gaudi_BlipTextLMHead_prepare_inputs_for_generation, - gaudi_BlipTextModel_forward, - gaudi_BlipTextSelfAttention_forward, -) diff --git a/optimum/habana/transformers/models/blip/modeling_blip.py b/optimum/habana/transformers/models/blip/modeling_blip.py deleted file mode 100644 index 6545a0662d..0000000000 --- a/optimum/habana/transformers/models/blip/modeling_blip.py +++ /dev/null @@ -1,124 +0,0 @@ -from typing import Optional - -import torch -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -@torch.no_grad() -def gaudi_BlipForConditionalGeneration_generate( - self, - pixel_values: torch.FloatTensor, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, -) -> torch.LongTensor: - """ - Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1022 - The only differences are: - - wrap hpu graph for each part - """ - if generate_kwargs.get("hpu_graphs", True): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - if not hasattr(self.vision_model, "clear_cache"): - self.vision_model = wrap_in_hpu_graph(self.vision_model) - if not hasattr(self.text_decoder, "clear_cache"): - self.text_decoder = wrap_in_hpu_graph(self.text_decoder) - - batch_size = pixel_values.shape[0] - vision_outputs = self.vision_model(pixel_values=pixel_values) - - image_embeds = vision_outputs[0] - - image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device) - - if isinstance(input_ids, list): - input_ids = torch.LongTensor(input_ids) - elif input_ids is None: - input_ids = ( - torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) - - input_ids[:, 0] = self.config.text_config.bos_token_id - attention_mask = attention_mask[:, :-1] if attention_mask is not None else None - - outputs = self.text_decoder.generate( - input_ids=input_ids[:, :-1], - eos_token_id=self.config.text_config.sep_token_id, - pad_token_id=self.config.text_config.pad_token_id, - attention_mask=attention_mask, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_attention_mask, - **generate_kwargs, - ) - - return outputs - - -@torch.no_grad() -def gaudi_BlipForQuestionAnswering_generate( - self, - input_ids: torch.LongTensor, - pixel_values: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, - 
**generate_kwargs, -) -> torch.LongTensor: - """ - Copied from BlipForQuestionAnswering.generate: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip.py#L1236 - The only differences are: - - wrap hpu graph for each part - - torch.full adds dtype=torch.int64; otherwise the default type is torch.float32, which leads to a core dump in the embedding layer - """ - if generate_kwargs.get("hpu_graphs", True): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - if not hasattr(self.vision_model, "clear_cache"): - self.vision_model = wrap_in_hpu_graph(self.vision_model) - if not hasattr(self.text_encoder, "clear_cache"): - self.text_encoder = wrap_in_hpu_graph(self.text_encoder) - if not hasattr(self.text_decoder, "clear_cache"): - self.text_decoder = wrap_in_hpu_graph(self.text_decoder) - - vision_outputs = self.vision_model(pixel_values=pixel_values) - - image_embeds = vision_outputs[0] - - image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device) - - if isinstance(input_ids, list): - input_ids = torch.LongTensor(input_ids) - - question_outputs = self.text_encoder( - input_ids=input_ids, - attention_mask=attention_mask, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_attention_mask, - return_dict=False, - ) - - question_embeds = question_outputs[0] - - question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long).to(question_embeds.device) - - bos_ids = torch.full( - (question_embeds.size(0), 1), - fill_value=self.decoder_start_token_id, - device=question_embeds.device, - dtype=torch.int64, - ) - - outputs = self.text_decoder.generate( - input_ids=bos_ids, - eos_token_id=self.config.text_config.sep_token_id, - pad_token_id=self.config.text_config.pad_token_id, - encoder_hidden_states=question_embeds, - encoder_attention_mask=question_attention_mask, - **generate_kwargs, - ) - - return outputs diff --git a/optimum/habana/transformers/models/blip/modeling_blip_text.py b/optimum/habana/transformers/models/blip/modeling_blip_text.py deleted file mode 100644 index 23d4ee3f3c..0000000000 --- a/optimum/habana/transformers/models/blip/modeling_blip_text.py +++ /dev/null @@ -1,538 +0,0 @@ -import math -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, -) -from transformers.modeling_utils import apply_chunking_to_forward -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -def gaudi_BlipTextSelfAttention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor]: - """ - Copied from BlipTextSelfAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L143 - The only differences are: - - add token_idx - """ - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come 
from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - if token_idx is not None: - past_key_value[0].index_copy_(2, token_idx - 1, key_layer) - past_key_value[1].index_copy_(2, token_idx - 1, value_layer) - key_layer = past_key_value[0] - value_layer = past_key_value[1] - else: - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) - attention_scores = attention_scores + attention_mask.to(attention_scores.device) - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs_dropped = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs_dropped = attention_probs_dropped * head_mask - - context_layer = torch.matmul(attention_probs_dropped, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - outputs = outputs + (past_key_value,) - return outputs - - -def gaudi_BlipTextAttention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor]: - """ - Copied from BlipTextAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L265 - The only differences are: - - add token_idx - """ - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - token_idx=token_idx, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -def gaudi_BlipTextLayer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor]: - """ - Copied from BlipTextLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L333 - The only differences are: - - add token_idx - """ - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - token_idx=token_idx, - ) - attention_output = self_attention_outputs[0] - - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - - if encoder_hidden_states is not None: - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - output_attentions=output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs - - outputs = outputs + (present_key_value,) - - return outputs - - -def gaudi_BlipTextEncoder_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = 
None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from BlipTextEncoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L391 - The only differences are: - - add token_idx - """ - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.is_decoder else None - - next_decoder_cache = () if use_cache else None - - for i in range(self.config.num_hidden_layers): - layer_module = self.layer[i] - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.__call__, - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -def gaudi_BlipTextModel_forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - is_decoder: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> 
Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - """ - Copied from BlipTextModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L666 - The only differences are: - - add token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - batch_size, seq_length = input_shape - device = input_ids.device - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size, seq_length = input_shape - device = inputs_embeds.device - elif encoder_embeds is not None: - input_shape = encoder_embeds.size()[:-1] - batch_size, seq_length = input_shape - device = encoder_embeds.device - else: - raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length))).to(device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
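# Illustrative sketch with assumed shapes, not part of the deleted file: what
# get_extended_attention_mask produces from the 2D padding mask built above in the
# non-causal case (the decoder path additionally folds in a causal mask). The mask is
# broadcast to [batch_size, 1, 1, seq_length] and turned into an additive bias that is
# 0 for kept positions and a large negative value for padded ones.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])                    # [batch, seq], 0 = padding
extended = attention_mask[:, None, None, :].to(torch.float32)    # [batch, 1, 1, seq]
extended = (1.0 - extended) * torch.finfo(torch.float32).min     # additive bias for the scores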
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device, is_decoder - ) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if encoder_hidden_states is not None: - if isinstance(encoder_hidden_states, list): - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() - else: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - - if isinstance(encoder_attention_mask, list): - encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] - elif encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if encoder_embeds is None: - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - else: - embedding_output = encoder_embeds - - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - -def gaudi_BlipTextLMHead_forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.Tensor]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_logits: Optional[bool] = False, - is_decoder: Optional[bool] = True, - reduction: Optional[str] = "mean", - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple[torch.Tensor], 
CausalLMOutputWithCrossAttentions]: - """ - Copied from BlipTextLMHeadModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L820 - The only differences are: - - add token_idx - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - is_decoder=is_decoder, - token_idx=token_idx, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - if return_logits: - return prediction_scores[:, :-1, :].contiguous() - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) - loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) - lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if reduction == "none": - lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) - - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - -def gaudi_BlipTextLMHead_prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, **model_kwargs -): - """ - Copied from BlipTextLMHeadModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L910 - The only differences are: - - add token_idx support, add position_ids - """ - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = None - - if token_idx is not None and attention_mask is not None: - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "encoder_hidden_states": 
model_kwargs.get("encoder_hidden_states", None), - "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), - "is_decoder": True, - "token_idx": token_idx, - "position_ids": position_ids, - } diff --git a/optimum/habana/transformers/models/bloom/__init__.py b/optimum/habana/transformers/models/bloom/__init__.py deleted file mode 100644 index 8aa34e4145..0000000000 --- a/optimum/habana/transformers/models/bloom/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .modeling_bloom import ( - GaudiBloomForCausalLM, - GaudiBloomMLP, - gaudi_bloom_attention_forward, - gaudi_bloom_block_forward, - gaudi_bloom_convert_to_bloom_cache, - gaudi_bloom_convert_to_standard_cache, - gaudi_bloom_model_forward, -) diff --git a/optimum/habana/transformers/models/bloom/modeling_bloom.py b/optimum/habana/transformers/models/bloom/modeling_bloom.py deleted file mode 100644 index df99463c15..0000000000 --- a/optimum/habana/transformers/models/bloom/modeling_bloom.py +++ /dev/null @@ -1,612 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### -# Copyright (C) 2022-2023 Habana Labs, Ltd. an Intel Company -############################################################################### -import math -import os -import warnings -from typing import Optional, Tuple, Union - -import torch -from torch.nn import CrossEntropyLoss -from torch.nn import functional as F -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions -from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomMLP, dropout_add -from transformers.utils import logging - -from ...modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask - - -logger = logging.get_logger(__name__) - - -def gaudi_bloom_build_alibi_tensor( - attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype, training: bool -) -> torch.Tensor: - """ - Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it - relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value - `softmax(l+a) = softmax(l)`. Based on - https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 - TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. - - Args: - Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) - attention_mask (`torch.Tensor`): - Token-wise attention mask, this should be of shape (batch_size, max_seq_len). - num_heads (`int`): - Number of heads. - dtype (`torch.dtype`): - Dtype of the output tensor. - training (`bool`): - Whether the model is being trained or not. 
- """ - batch_size, seq_length = attention_mask.shape - closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) - base = torch.tensor( - 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 - ) - powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != num_heads: - extra_base = torch.tensor( - 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 - ) - num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) - extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - - # Note: alibi will added to the attention bias that will be applied to the query, key product of attention - # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) - # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) - # => the query_length dimension will then be broadcasted correctly - # This is more or less identical to T5's relative position bias: - # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 - if training: - arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] - alibi = slopes[..., None] * arange_tensor - return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) - else: - # code taken from Megatron transformer.py - alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(seq_length, device=attention_mask.device).unsqueeze( - 0 - ).unsqueeze(0).expand(num_heads, -1, -1) - - # Select the part of the tensor that corresponds to our tensor parallel index. 
- # if inference_tp_size is set use it instead of world size - world = int(os.environ.get("WORLD_SIZE", 1)) - tp_world_size = GaudiBloomForCausalLM.inference_tp_size if GaudiBloomForCausalLM.inference_tp_size else world - tp_index = 0 # if world size == 1 ignore rank and use 0 (for cases where WORLD_SIZE is not equal to tp size) - if tp_world_size > 1: - tp_index = int(os.environ.get("RANK", 0)) - - alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] - - alibi = alibi.repeat(batch_size, 1, 1) - return alibi.to(dtype) - - -def update(prev, cur, dim, idx): - if idx is not None: - if os.environ.get("WA_INDEX_COPY", "1") == "1": - past_selector, value_selector = idx - if dim == 1: - sel = torch.cat([past_selector, value_selector.unsqueeze(2)], dim=2) - val = torch.cat([prev, cur], dim=1) - return torch.bmm(sel, val) - else: - sel = torch.cat([past_selector, value_selector.unsqueeze(1)], dim=1) - val = torch.cat([prev, cur], dim=2) - return torch.bmm(val, sel) - else: - return prev.index_copy_(dim, idx - 1, cur) - else: - return torch.cat((prev, cur), dim=dim) - - -def gaudi_bloom_attention_forward( - self, - hidden_states: torch.Tensor, - residual: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -): - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - - batch_size, q_length, _, _ = query_layer.shape - - query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - key_layer = key_layer.permute(0, 2, 3, 1).reshape(batch_size * self.num_heads, self.head_dim, q_length) - value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - - # Collapse views to improve performance on HPU - query_layer = query_layer.contiguous() - key_layer = key_layer.contiguous() - value_layer = value_layer.contiguous() - - if layer_past is not None: - past_key, past_value = layer_past - # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, head_dim, kv_length] - # - value: [batch_size * self.num_heads, kv_length, head_dim] - key_layer = update(past_key, key_layer, 2, token_idx) - value_layer = update(past_value, value_layer, 1, token_idx) - - _, _, kv_length = key_layer.shape - - if use_cache is True: - present = (key_layer, value_layer) - else: - present = None - - # [batch_size * num_heads, q_length, kv_length] - # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11 - matmul_result = alibi.baddbmm( - batch1=query_layer, - batch2=key_layer, - beta=self.beta, - alpha=self.inv_norm_factor, - ) - - # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) - - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) - attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(input_dtype) - - # [batch_size, 
num_heads, q_length, kv_length] - attention_probs = self.attention_dropout(attention_probs) - - if head_mask is not None: - attention_probs = attention_probs * head_mask - - # change view [batch_size x num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, kv_length) - - # matmul: [batch_size * num_heads, q_length, head_dim] - context_layer = torch.bmm(attention_probs_reshaped, value_layer) - - # change view [batch_size, q_length, num_heads * head_dim] - context_layer = self._merge_heads(context_layer) - - # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 - if self.pretraining_tp > 1 and self.slow_but_exact: - slices = self.hidden_size / self.pretraining_tp - output_tensor = torch.zeros_like(context_layer) - for i in range(self.pretraining_tp): - output_tensor = output_tensor + F.linear( - context_layer[:, :, int(i * slices) : int((i + 1) * slices)], - self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], - ) - else: - output_tensor = self.dense(context_layer) - - output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) - - outputs = (output_tensor, present) - if output_attentions: - outputs += (attention_probs,) - - return outputs - - -class GaudiBloomMLP(BloomMLP): - def __init__(self, config): - super().__init__(config) - self.gelu_impl = torch.nn.GELU(approximate="tanh") - - -def gaudi_bloom_block_forward( - self, - hidden_states: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -): - # hidden_states: [batch_size, seq_length, hidden_size] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - - # Layer norm post the self attention. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - # Self attention. - attn_outputs = self.self_attention( - layernorm_output, - residual, - layer_past=layer_past, - attention_mask=attention_mask, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - - attention_output = attn_outputs[0] - - outputs = attn_outputs[1:] - - layernorm_output = self.post_attention_layernorm(attention_output) - - # Get residual - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = attention_output - - # MLP. - output = self.mlp(layernorm_output, residual) - - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] - - return outputs # hidden_states, present, attentions - - -def gaudi_bloom_convert_to_standard_cache( - self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int, training: bool -) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Standardizes the format of the cache so as to match most implementations, i.e. 
to tuple(tuple([batch_size, - num_heads, ...])) - """ - batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape - if training: - num_heads = batch_size_times_num_heads // batch_size - else: - world = int(os.environ.get("WORLD_SIZE", 1)) - tp_world_size = GaudiBloomForCausalLM.inference_tp_size if GaudiBloomForCausalLM.inference_tp_size else world - num_heads = self.config.n_head // tp_world_size - batch_size = batch_size_times_num_heads // num_heads - # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] - # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size, num_heads, head_dim, seq_length), - layer_past[1].view(batch_size, num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - - -def gaudi_bloom_convert_to_bloom_cache( - self, past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] -) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...])) - """ - batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape - batch_size_times_num_heads = batch_size * num_heads - # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] - # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), - layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - - -def gaudi_bloom_model_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - **deprecated_arguments, -) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape batch_size x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - hidden_states = self.word_embeddings_layernorm(inputs_embeds) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # Compute alibi tensor: check gaudi_bloom_build_alibi_tensor - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - - alibi = gaudi_bloom_build_alibi_tensor(attention_mask, self.num_heads, hidden_states.dtype, self.training) - - causal_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - causal_mask = causal_mask.bool() - - if token_idx is not None and past_key_values[0] is not None and os.environ.get("WA_INDEX_COPY", "1") == "1": - pkv = past_key_values[0][0] - cur = torch.nn.functional.one_hot(torch.tile(token_idx - 1, (pkv.shape[0],)), pkv.shape[-1]).to(pkv.dtype) - past = torch.diag_embed(1 - cur) - token_idx = (past, cur) - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - alibi, - causal_mask, - layer_past, - head_mask[i], - use_cache, - output_attentions, - None, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=causal_mask, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - alibi=alibi, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Add last hidden state - hidden_states = self.ln_f(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class GaudiBloomForCausalLM(BloomForCausalLM): - inference_tp_size = None - - def set_tp_for_inference(tp_for_inference: int): - world = int(os.environ.get("WORLD_SIZE", 1)) - assert tp_for_inference == 1 or tp_for_inference == world, "only setting 1 (no tp) or world size is supported" - GaudiBloomForCausalLM.inference_tp_size = tp_for_inference - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - token_idx: Optional[torch.Tensor] = None, - **kwargs, - ) -> dict: - # only last tokens for input_ids if past is not None - if past_key_values is not None: - if token_idx is None: - input_ids = input_ids[:, -1].unsqueeze(-1) - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - # the cache may be in the stardard format (e.g. 
in contrastive search), convert to bloom's format if needed - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - **deprecated_arguments, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - if deprecated_arguments.pop("position_ids", False) is not False: - # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` - warnings.warn( - "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" - " passing `position_ids`.", - FutureWarning, - ) - if len(deprecated_arguments) > 0: - raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_size, seq_length, vocab_size = shift_logits.shape - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - def _reorder_cache( - self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. - """ - standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx), training=self.training) - - # Get a copy of `beam_idx` on all the devices where we need those indices. 
- device_to_beam_idx = { - past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past - } - reordered_past = tuple( - ( - layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), - layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), - ) - for layer_past in standardized_past - ) - return self._convert_to_bloom_cache(reordered_past) diff --git a/optimum/habana/transformers/models/clip/__init__.py b/optimum/habana/transformers/models/clip/__init__.py deleted file mode 100644 index faa3a3355b..0000000000 --- a/optimum/habana/transformers/models/clip/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_clip import GaudiCLIPVisionEmbeddings diff --git a/optimum/habana/transformers/models/clip/modeling_clip.py b/optimum/habana/transformers/models/clip/modeling_clip.py deleted file mode 100644 index 604c878365..0000000000 --- a/optimum/habana/transformers/models/clip/modeling_clip.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings - - -class GaudiCLIPVisionEmbeddings(CLIPVisionEmbeddings): - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - target_dtype = self.patch_embedding.weight.dtype - # if HQT quantization enabled, remove the explicit cast to float8 to avoid HQT casting error - if "float8" in str(target_dtype) and pixel_values.device.type == "hpu": - target_dtype = torch.bfloat16 - patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings diff --git a/optimum/habana/transformers/models/codegen/__init__.py b/optimum/habana/transformers/models/codegen/__init__.py deleted file mode 100644 index d433e24c8d..0000000000 --- a/optimum/habana/transformers/models/codegen/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_codegen import ( - GaudiCodeGenAttention, - GaudiCodeGenForCausalLM, - gaudi_codegen_block_forward, - gaudi_codegen_model_forward, -) diff --git a/optimum/habana/transformers/models/codegen/modeling_codegen.py b/optimum/habana/transformers/models/codegen/modeling_codegen.py deleted file mode 100644 index b568085971..0000000000 --- a/optimum/habana/transformers/models/codegen/modeling_codegen.py +++ /dev/null @@ -1,420 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.codegen.modeling_codegen import ( - CodeGenAttention, - CodeGenForCausalLM, - apply_rotary_pos_emb, - logger, -) - - -class GaudiCodeGenAttention(CodeGenAttention): - def forward( - self, - hidden_states: Optional[torch.FloatTensor], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[ - Tuple[torch.Tensor, Tuple[torch.Tensor]], - Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, 
...]]], - ]: - """ - Copied from CodeGenAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - qkv = self.qkv_proj(hidden_states) - # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic - mp_num = 4 - qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) - local_dim = self.head_dim * self.num_attention_heads // mp_num - query, value, key = torch.split(qkv_split, local_dim, dim=-1) - query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num) - key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num) - - value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num) - value = value.permute(0, 2, 1, 3) - - embed_positions = self.embed_positions - if embed_positions.device != position_ids.device: - embed_positions = embed_positions.to(position_ids.device) - self.embed_positions = embed_positions - - sincos = embed_positions[position_ids] - sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) - - if self.rotary_dim is not None: - k_rot = key[:, :, :, : self.rotary_dim] - k_pass = key[:, :, :, self.rotary_dim :] - - q_rot = query[:, :, :, : self.rotary_dim] - q_pass = query[:, :, :, self.rotary_dim :] - - k_rot = apply_rotary_pos_emb(k_rot, sin, cos) - q_rot = apply_rotary_pos_emb(q_rot, sin, cos) - - key = torch.cat([k_rot, k_pass], dim=-1) - query = torch.cat([q_rot, q_pass], dim=-1) - else: - key = apply_rotary_pos_emb(key, sin, cos) - query = apply_rotary_pos_emb(query, sin, cos) - - key = key.permute(0, 2, 1, 3) - query = query.permute(0, 2, 1, 3) - - if layer_past is not None: - past_key = layer_past[0] - past_value = layer_past[1] - if token_idx is not None: - key = past_key.index_add_(2, token_idx - 1, key - torch.index_select(past_key, 2, token_idx - 1)) - value = past_value.index_add_( - 2, token_idx - 1, value - torch.index_select(past_value, 2, token_idx - 1) - ) - else: - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - if use_cache is True: - present = (key, value) - else: - present = None - - # compute self-attention: V x Softmax(QK^T) - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) - attn_output = self.out_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -def gaudi_codegen_block_forward( - self, - hidden_states: Optional[torch.FloatTensor], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: - """ - Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_outputs = self.attn( - 
hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - - feed_forward_hidden_states = self.mlp(hidden_states) - hidden_states = attn_output + feed_forward_hidden_states + residual - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions) - - -def gaudi_codegen_model_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from CodeGenBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) - - # Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
- attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_attention_heads x N x N - # head_mask has shape n_layer x batch x num_attention_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - - hidden_states = inputs_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " - "`use_cache=False`..." - ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - None, - attention_mask, - position_ids, - head_mask[i], - use_cache, - output_attentions, - None, - ) - else: - outputs = block( - hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class GaudiCodeGenForCausalLM(CodeGenForCausalLM): - """ - Inherits from CodeGenForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - when KV cache is enabled, slice next_input_ids from input_ids based on the token_idx - - when KV cache is enabled, slice next_position_ids from position_ids based on the token_idx - """ - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, token_idx=None, **kwargs): - token_type_ids = 
kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - if token_type_ids is not None: - token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) - else: - input_ids = input_ids[:, -1] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -1] - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - "token_idx": token_idx, - } - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - # make sure sampling in fp16 works correctly and - # compute loss in fp32 to match with mesh-tf version - # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 - lm_logits = self.lm_head(hidden_states).to(torch.float32) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/optimum/habana/transformers/models/esm/__init__.py b/optimum/habana/transformers/models/esm/__init__.py deleted file mode 100644 index ca83d98292..0000000000 --- a/optimum/habana/transformers/models/esm/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_esmfold import ( - gaudi_esm_for_protein_folding_forward, - gaudi_esmfolding_trunk_forward, - gaudi_rot_matmul, - gaudi_rot_vec_mul, -) diff --git a/optimum/habana/transformers/models/esm/modeling_esmfold.py b/optimum/habana/transformers/models/esm/modeling_esmfold.py deleted file mode 100644 index 88b6dac0d2..0000000000 --- a/optimum/habana/transformers/models/esm/modeling_esmfold.py +++ /dev/null @@ -1,347 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Optional - -import torch -from transformers.models.esm.modeling_esmfold import EsmForProteinFoldingOutput, categorical_lddt -from transformers.models.esm.openfold_utils import ( - compute_predicted_aligned_error, - compute_tm, - make_atom14_masks, -) -from transformers.utils import ( - ContextManagers, -) - - -def gaudi_esmfolding_trunk_forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles): - """ - Inputs: - seq_feats: B x L x C tensor of sequence features pair_feats: B x L x L x C tensor of pair features residx: B - x L long tensor giving the position in the sequence mask: B x L boolean tensor indicating valid residues - - Output: - predicted_structure: B x L x (num_atoms_per_residue * 3) tensor wrapped in a Coordinates object - - Copied from EsmFoldingTrunk.forward: - https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esmfold.py - The change is: - - Add extra mark_step in trunk_iter for each block. - """ - - device = seq_feats.device - s_s_0 = seq_feats - s_z_0 = pair_feats - - if no_recycles is None: - no_recycles = self.config.max_recycles - else: - if no_recycles < 0: - raise ValueError("Number of recycles must not be negative.") - no_recycles += 1 # First 'recycle' is just the standard forward pass through the model. - - def trunk_iter(s, z, residx, mask): - z = z + self.pairwise_positional_embedding(residx, mask=mask) - - for block in self.blocks: - s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size) - if s.device.type == "hpu": - import habana_frameworks.torch.core as htcore - - htcore.mark_step() - return s, z - - s_s = s_s_0 - s_z = s_z_0 - recycle_s = torch.zeros_like(s_s) - recycle_z = torch.zeros_like(s_z) - recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64) - - for recycle_idx in range(no_recycles): - with ContextManagers([] if recycle_idx == no_recycles - 1 else [torch.no_grad()]): - # === Recycling === - recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device) - recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device) - recycle_z += self.recycle_disto(recycle_bins.detach()).to(device) - - s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask) - - # === Structure module === - structure = self.structure_module( - {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)}, - true_aa, - mask.float(), - ) - - recycle_s = s_s - recycle_z = s_z - # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold. 
- recycle_bins = self.distogram( - structure["positions"][-1][:, :, :3], - 3.375, - 21.375, - self.recycle_bins, - ) - - structure["s_s"] = s_s - structure["s_z"] = s_z - - return structure - - -def gaudi_esm_for_protein_folding_forward( - self, - input_ids: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - masking_pattern: Optional[torch.Tensor] = None, - num_recycles: Optional[int] = None, -) -> EsmForProteinFoldingOutput: - r""" - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, EsmForProteinFolding - - >>> model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1") - >>> tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1") - >>> inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt", add_special_tokens=False) # A tiny random peptide - >>> outputs = model(**inputs) - >>> folded_positions = outputs.positions - ``` - - Copied from EsmForProteinFolding.forward: - https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/modeling_esmfold.py - The change is: - - rewrite (softmax().unsqueeze() @ esm_s).squeeze() with equivalent but less dims algorithm on HPU. - - """ - cfg = self.config.esmfold_config - - aa = input_ids # B x L - B = aa.shape[0] - L = aa.shape[1] - device = input_ids.device - if attention_mask is None: - attention_mask = torch.ones_like(aa, device=device) - if position_ids is None: - position_ids = torch.arange(L, device=device).expand_as(input_ids) - - # === ESM === - esmaa = self.af2_idx_to_esm_idx(aa, attention_mask) - - if masking_pattern is not None: - masked_aa, esmaa, mlm_targets = self.bert_mask(aa, esmaa, attention_mask, masking_pattern) - else: - masked_aa = aa - mlm_targets = None - - # We get sequence and pair representations from whatever version of ESM / - # configuration we are using. The sequence representation esm_s is always - # present. The pair embedding esm_z may be present depending on the - # configuration of the model. If esm_z is not used by the model then it - # is returned as None here. - esm_s = self.compute_language_model_representations(esmaa) - - # Convert esm_s and esm_z, if present, to the precision used by the trunk and - # the structure module. These tensors may be a lower precision if, for example, - # we're running the language model in fp16 precision. - esm_s = esm_s.to(self.esm_s_combine.dtype) - - if cfg.esm_ablate_sequence: - esm_s = esm_s * 0 - - esm_s = esm_s.detach() - - # === preprocessing === - if esm_s.device.type == "hpu": - dims = esm_s.shape - esm_s = esm_s.reshape(-1, dims[-2], dims[-1]) # combine first 2 dims - esm_s = self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s - esm_s = esm_s.reshape(dims[0], dims[1], esm_s.shape[-2], esm_s.shape[-1]) # split back 1st dim - esm_s = esm_s.squeeze(2) - else: - esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2) - s_s_0 = self.esm_s_mlp(esm_s) - - s_z_0 = s_s_0.new_zeros(B, L, L, cfg.trunk.pairwise_state_dim) - - if self.config.esmfold_config.embed_aa: - s_s_0 += self.embedding(masked_aa) - - structure: dict = self.trunk(s_s_0, s_z_0, aa, position_ids, attention_mask, no_recycles=num_recycles) - # Documenting what we expect: - structure = { - k: v - for k, v in structure.items() - if k - in [ - "s_z", - "s_s", - "frames", - "sidechain_frames", - "unnormalized_angles", - "angles", - "positions", - "states", - ] - } - - # Add BERT mask for the loss to use, if available. 
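The `=== preprocessing ===` branch above replaces `(self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)` with an equivalent lower-rank matmul: the batch and length dimensions are folded together so the HPU only sees 3-D operands, then unfolded again. A small self-contained check of that equivalence, using toy shapes in place of the real ESM representation sizes:

```python
import torch

# Toy stand-ins: esm_s is (batch, length, num_layers + 1, channels) stacked
# language-model representations, esm_s_combine a learned per-layer weight.
B, L, n_layers, C = 2, 5, 4, 8
esm_s = torch.randn(B, L, n_layers, C)
esm_s_combine = torch.randn(n_layers)

# Reference form (4-D batched matmul, as in the upstream model).
ref = (esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)

# HPU-friendly rewrite from the patch: combine the first two dims, matmul in
# 3-D, then split the batch dimensions back out.
x = esm_s.reshape(-1, n_layers, C)
out = esm_s_combine.softmax(0).unsqueeze(0) @ x
out = out.reshape(B, L, out.shape[-2], out.shape[-1]).squeeze(2)

assert torch.allclose(ref, out, atol=1e-6)
```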
- if mlm_targets: - structure["mlm_targets"] = mlm_targets - - disto_logits = self.distogram_head(structure["s_z"]) - disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2 - structure["distogram_logits"] = disto_logits - - lm_logits = self.lm_head(structure["s_s"]) - structure["lm_logits"] = lm_logits - - structure["aatype"] = aa - make_atom14_masks(structure) - # Of course, this doesn't respect the true mask because it doesn't know about it... - # We're not going to properly mask change of index tensors: - # "residx_atom14_to_atom37", - # "residx_atom37_to_atom14", - for k in [ - "atom14_atom_exists", - "atom37_atom_exists", - ]: - structure[k] *= attention_mask.unsqueeze(-1) - structure["residue_index"] = position_ids - - lddt_head = self.lddt_head(structure["states"]).reshape(structure["states"].shape[0], B, L, -1, self.lddt_bins) - structure["lddt_head"] = lddt_head - plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins) - structure["plddt"] = plddt - - ptm_logits = self.ptm_head(structure["s_z"]) - structure["ptm_logits"] = ptm_logits - structure["ptm"] = compute_tm(ptm_logits, max_bin=31, no_bins=self.distogram_bins) - structure.update(compute_predicted_aligned_error(ptm_logits, max_bin=31, no_bins=self.distogram_bins)) - - return EsmForProteinFoldingOutput(**structure) - - -def gaudi_rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: - """ - Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. - - Args: - r: [*, 3, 3] rotation matrices - t: [*, 3] coordinate tensors - Returns: - [*, 3] rotated coordinates - - Copied from rot_vec_mul: - https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/openfold_utils/rigid_utils.py - The change is: - - Using matmul when possible on HPU to get better performance. - """ - # Do matmal on HPU directly when possible to get better performance. - if r.device.type == "hpu": - if t.dim() > 5: - pass - elif t.dim() == 5: - # Combine shape[2] and shape[3] on HPU - shape_t = t.shape - shape_r = r.shape - t = t.reshape(shape_t[0], shape_t[1], shape_t[2] * shape_t[3], shape_t[4]) - r = r.reshape(shape_r[0], shape_r[1], shape_r[2] * shape_r[3], shape_r[4], shape_r[5]) - t = t.unsqueeze(-2) - r = r.transpose(-2, -1) - out = t @ r - shape_out = out.shape - out = out.reshape( - shape_out[0], - shape_out[1], - max(shape_r[2], shape_t[2]), - max(shape_r[3], shape_t[3]), - shape_out[3], - shape_out[4], - ) - out = out.squeeze(-2) - return out - else: - t = t.unsqueeze(-2) - r = r.transpose(-2, -1) - out = t @ r - out = out.squeeze(-2) - return out - - x, y, z = torch.unbind(t, dim=-1) - return torch.stack( - [ - r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z, - r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z, - r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z, - ], - dim=-1, - ) - - -def gaudi_rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - """ - Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. - - Args: - a: [*, 3, 3] left multiplicand - b: [*, 3, 3] right multiplicand - Returns: - The product ab - - Copied from rot_matmul: - https://github.com/huggingface/transformers/blob/main/src/transformers/models/esm/openfold_utils/rigid_utils.py - The change is: - - Using matmul when possible on HPU to get better performance. - """ - - # Do matmal on HPU directly when possible to get better performance. 
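Both rotation helpers keep the hand-written element-wise form off-HPU (written out to avoid AMP downcasting) and switch to a batched matmul on HPU for performance. The two formulations are numerically equivalent; a quick CPU check for the vector case in `gaudi_rot_vec_mul`, with arbitrary toy batch dimensions:

```python
import torch

# Toy batch of rotations and coordinates: [*, 3, 3] matrices, [*, 3] vectors.
r = torch.randn(4, 7, 3, 3)
t = torch.randn(4, 7, 3)

# Hand-written row products, as in the non-HPU branch of gaudi_rot_vec_mul.
x, y, z = torch.unbind(t, dim=-1)
manual = torch.stack(
    [
        r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z,
        r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z,
        r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z,
    ],
    dim=-1,
)

# Matmul form preferred on HPU: treat t as a row vector, multiply by r^T,
# then drop the singleton dimension again.
via_matmul = (t.unsqueeze(-2) @ r.transpose(-2, -1)).squeeze(-2)

assert torch.allclose(manual, via_matmul, atol=1e-5)
```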
- if a.device.type == "hpu": - if a.shape == b.shape or a.dim() < 5: - out = a @ b - return out - elif a.dim() == 5 and a.shape[2] == 1: - # HPU does not handle dim==5 with below broadcast correctly. - # a.shape = torch.Size([1, 512, 1, 3, 3]), b.shape = torch.Size([1, 512, 8, 3, 3]) - a = a.permute(0, 1, 2, 4, 3) - b = b.permute(0, 1, 2, 4, 3) - out = b @ a - out = out.permute(0, 1, 2, 4, 3) - return out - else: - pass - - def row_mul(i: int) -> torch.Tensor: - return torch.stack( - [ - a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0], - a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1], - a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2], - ], - dim=-1, - ) - - return torch.stack( - [ - row_mul(0), - row_mul(1), - row_mul(2), - ], - dim=-2, - ) diff --git a/optimum/habana/transformers/models/falcon/__init__.py b/optimum/habana/transformers/models/falcon/__init__.py deleted file mode 100644 index 0b15ff5617..0000000000 --- a/optimum/habana/transformers/models/falcon/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .modeling_falcon import ( - GaudiFalconAttention, - GaudiFalconDecoderLayer, - GaudiFalconForCausalLM, - GaudiFalconMLP, - GaudiFalconModel, - gaudi_falcon_linear_forward, -) diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py deleted file mode 100644 index 190a94e7a5..0000000000 --- a/optimum/habana/transformers/models/falcon/modeling_falcon.py +++ /dev/null @@ -1,1122 +0,0 @@ -import contextlib -import math -import os -import warnings -from typing import Optional, Tuple, Union - -import torch - - -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused kernel for scaled_dot_product_attention") - FusedSDPA = None - -try: - from habana_frameworks.torch.hpu import sdp_kernel - - SDPContext = True -except ImportError: - SDPContext = False - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE -except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None - - -import habana_frameworks.torch.core as htcore -from torch import nn -from torch.nn import CrossEntropyLoss -from torch.nn import functional as F -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, -) -from transformers.models.falcon.configuration_falcon import FalconConfig -from transformers.models.falcon.modeling_falcon import ( - FalconAttention, - FalconDecoderLayer, - FalconForCausalLM, - FalconMLP, - FalconModel, - apply_rotary_pos_emb, - build_alibi_tensor, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - GaudiAttentionMaskConverter, - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: - """ - Copied from transformers.models.falcon.modeling_falcon/dropout_add - https://github.com/huggingface/transformers/blob/b338a6c3b8eda29610d4d472cad8cd87cbfdaaed/src/transformers/models/falcon/modeling_falcon.py#L248 - """ - out = F.dropout(x, p=prob, training=training) - if training: - out = residual + out - return out - else: - residual.add_(out) - 
return residual - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and FusedRoPE: - # TODO: remove `.clone()` when it is fixed in SynapseAI - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ) - else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) - - -def gaudi_falcon_linear_forward(self, input: torch.Tensor) -> torch.Tensor: - hidden_states = F.linear(input, self.weight, bias=self.bias) - return hidden_states - - -# FusedScaledDotProductAttention -class ModuleFusedSDPA(torch.nn.Module): - def __init__(self, fusedSDPA): - super().__init__() - self._hpu_kernel_fsdpa = fusedSDPA - - def forward(self, query, key, value, attn_mask, dropout_p, is_casual, scale, softmax_mode): - return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_casual, scale, softmax_mode) - - -class Softmax(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, dim=None, invAttnHead=None): - return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) - - -class Matmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, *args, **kwargs): - return torch.matmul(*args, **kwargs) - - -# ScaledDotProductAttention is based on torch.nn.functional.scaled_dot_product_attention -class ScaledDotProductAttention(nn.Module): - def __init__(self, config: FalconConfig): - super().__init__() - self.head_dim = config.hidden_size // config.num_attention_heads - self.bmm1 = Matmul() - self.bmm2 = Matmul() - self.softmax = Softmax() - self.num_key_value_groups = config.num_attention_heads // config.num_kv_heads - - def repeat_kv( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, - ): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
- The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - - def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor: - L, S = query.size(-2), key.size(-2) - scale_factor = 1 / math.sqrt(self.head_dim) - invAttnHead = torch.tensor(scale_factor, dtype=torch.float32).to("hpu") - - if is_causal: - assert attn_mask is None - attn_bias = torch.zeros(L, S, dtype=query.dtype) - temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) - attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) - attn_bias.to(query.dtype) - - if attn_mask is not None: - if attn_mask.dtype == torch.bool: - attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf")) - - query, key, value, attn_mask = self.repeat_kv(query, key, value, attn_mask, self.num_key_value_groups) - - attn_weight = self.bmm1(query, key.transpose(-2, -1)) - attn_weight += attn_mask - attn_weight = self.softmax(attn_weight, dim=-1, invAttnHead=invAttnHead) - attn_weight = torch.dropout(attn_weight, dropout_p, train=True) - attn_output = self.bmm2(attn_weight, value) - return attn_output - - -def update(prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - cur = cur.to(dtype=prev.dtype) - - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - - if cur.shape[-2] > 1 and cur.shape[-2] <= prev.shape[-2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - prev_cast = prev.to(orig_cur.dtype) - return prev_cast - else: - return torch.cat((prev, cur), dim=dim) - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. 
self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - @staticmethod - def update(prev, cur, dim, idx, inp_seq_len): - return update(prev, cur, dim, idx, inp_seq_len) - - -class GaudiFalconAttention(FalconAttention): - """ - Inherits from FalconAttention: https://github.com/huggingface/transformers/blob/838b87abe231fd70be5132088d0dee72a7bb8d62/src/transformers/models/falcon/modeling_falcon.py#L267 - The only differences are: - - add new args token_idx and position_ids - - replace F.scaled_dot_product_attention with Habana torch's version for BF16 - - use ScaledDotProductAttention for FP8 quantization - - add new arg reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - Choice of SDPA: - There are these variables: use_flash_attention and datatype (bf16/fp8) - datatype is determined by presence of QUANT_CONFIG env var, presence of which indicates fp8 - 1. use_flash_attention, fp8: use ModuleFusedSDPA. most optimal - 2. use_flash_attention, bf16: use FusedSDPA - 3. not use_flash_attention, fp8: Use ScaledDotProductAttention, along with QUANT_CONFIG. This is the case before this PR - 4. not use_flash_attention, bf16: F.scaled_dot_product_attention. Slowest option - """ - - def __init__(self, config: FalconConfig): - super().__init__(config) - - self.is_fp8 = os.getenv("QUANT_CONFIG", "") != "" - - # In the constructor we do not know which one we will need later in the forward, so creating both - # TODO, Does this affect memory usage? - if self.is_fp8: - self.fused_scaled_dot_product_attention = ModuleFusedSDPA(FusedSDPA) - self.unfused_scaled_dot_product_attention = ScaledDotProductAttention(config) - - self.k_cache = KVCache() - self.v_cache = KVCache() - self.inp_seq_len = -1 - self.max_position_embeddings = config.max_position_embeddings - - def _split_heads( - self, fused_qkv: torch.Tensor, broadcast: Optional[bool] = True - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - if self.new_decoder_architecture: - batch, seq_len, _ = fused_qkv.shape - - if self.config.num_attention_heads != self.num_heads: # When DS divides heads for TP - num_heads = self.config.num_attention_heads - num_kv_heads = self.config.num_kv_heads - else: # When DS not in use - num_heads = self.num_heads - num_kv_heads = self.num_kv_heads - - qkv = fused_qkv.view(batch, seq_len, -1, num_heads // num_kv_heads + 2, self.head_dim) - # query = qkv[:, :, :, :-2] - # key = qkv[:, :, :, [-2]] - # value = qkv[:, :, :, [-1]] - d3 = qkv.shape[3] - 2 - query = torch.index_select(qkv, 3, index=torch.arange(d3, device=qkv.device)) - key = torch.index_select(qkv, 3, index=torch.tensor([d3], device=qkv.device)) - value = torch.index_select(qkv, 3, index=torch.tensor([d3 + 1], device=qkv.device)) - if broadcast: - key = torch.broadcast_to(key, query.shape) - value = torch.broadcast_to(value, query.shape) - - query, key, value = [x.flatten(2, 3) for x in (query, key, value)] - return query, key, value - elif not self.multi_query: - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) - # TODO : Need to be fixed to use index_select() - return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] - else: - batch_size, 
seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) - # return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] - d2 = fused_qkv.shape[2] - 2 - query = torch.index_select(fused_qkv, 2, index=torch.arange(d2, device=fused_qkv.device)) - key = torch.index_select(fused_qkv, 2, index=torch.tensor([d2], device=fused_qkv.device)) - value = torch.index_select(fused_qkv, 2, index=torch.tensor([d2 + 1], device=fused_qkv.device)) - return query, key, value - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - if self.config.new_decoder_architecture: - cache_shape = (batch_size, self.num_kv_heads, max_seq_len, self.head_dim) - else: - cache_shape = (batch_size, 1, max_seq_len, self.head_dim) - device = self.query_key_value.weight.device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings - # This helps in avoiding creation of these caches during actual model forward pass and - # reduce memory consumption and improve performance. - if seq_len > self.max_position_embeddings: - self.max_position_embeddings = seq_len - self.rotary_emb._set_cos_sin_cache( - seq_len, self.query_key_value.weight.device, self.query_key_value.weight.dtype - ) - - def pre_attn_forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: int = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - # 3 x [batch_size, seq_length, num_heads, head_dim] - - train_with_flash_attention = self.training and self._use_sdpa and not output_attentions and head_mask is None - (query_layer, key_layer, value_layer) = self._split_heads( - fused_qkv, not use_flash_attention and not self.is_fp8 and not train_with_flash_attention - ) - - batch_size, query_length, _, _ = query_layer.shape - - query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - - kv_seq_len = key_layer.shape[-2] - if layer_past is not None: - if token_idx is not None: - if reuse_cache: - kv_seq_len = layer_past[0][-2] - else: - kv_seq_len = layer_past[0].shape[-2] - else: - kv_seq_len += layer_past[0].shape[-2] - - if alibi is None: - cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) - query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) - - if use_cache: - if self.training: - present = None - else: - if reuse_cache: - key_layer = self.k_cache(key_layer, -2, token_idx) - value_layer = self.v_cache(value_layer, -2, token_idx) - present = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if layer_past is None: - past_key = torch.zeros( - key_layer.shape, - dtype=self.query_key_value.weight.dtype, - device=self.query_key_value.weight.device, - ) - past_value = torch.zeros( - key_layer.shape, - dtype=self.query_key_value.weight.dtype, - device=self.query_key_value.weight.device, - ) - layer_past = [past_key, past_value] - key_layer = self.k_cache.update( - layer_past[0], key_layer, -2, token_idx, self.inp_seq_len - ) # k_layer bs*1, q_len, head_dim - value_layer = self.v_cache.update(layer_past[1], value_layer, -2, token_idx, self.inp_seq_len) - if token_idx is None: - layer_past = (key_layer, value_layer) - present = layer_past - - if cache_idx is not None and query_length == 1: - key_layer = key_layer[:, :, :cache_idx, :] - value_layer = value_layer[:, :, :cache_idx, :] - attention_mask = attention_mask[:, :, :, :cache_idx] - else: - present = None - - if self.training or present is None: - kv_length = key_layer.shape[-2] - else: - kv_length = present[0][-2] if reuse_cache else present[0].shape[-2] - - if (not reuse_cache) and (token_idx is not None) and (cache_idx is not None) and (query_length == 1): - # Return only past key value shapes and not the tensors during decode phase (q len is 1) - # to avoid making past key values as persistent output tensors of HPU graphs. - present = (present[0].shape, present[1].shape) - - if alibi is None: # both train/inference - if output_attentions: - attention_scores = query_layer @ key_layer.transpose(-1, -2) - attention_scores /= math.sqrt(self.head_dim) - - attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) - # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). 
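When `output_attentions` is requested, this branch computes attention explicitly as softmax(QK^T / sqrt(d) + mask) @ V so the probabilities can be returned; otherwise one of the fused paths (ModuleFusedSDPA, FusedSDPA, or `F.scaled_dot_product_attention`) is taken. The formulations agree, which the toy check below illustrates with `torch.nn.functional.scaled_dot_product_attention` standing in for the Habana fused kernels:

```python
import math

import torch
import torch.nn.functional as F

# Toy tensors: (batch, heads, seq_len, head_dim) plus an additive float mask
# that hides the first two key positions for every query.
q, k, v = (torch.randn(1, 4, 6, 16) for _ in range(3))
mask = torch.zeros(1, 1, 6, 6)
mask[..., :2] = float("-inf")

# Explicit path: softmax(Q K^T / sqrt(d) + mask) @ V.
scores = q @ k.transpose(-1, -2) / math.sqrt(q.shape[-1])
explicit = F.softmax(scores + mask, dim=-1) @ v

# SDPA path (stand-in here for the HPU fused kernels).
fused = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

assert torch.allclose(explicit, fused, atol=1e-5)
```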
- attn_output = attention_scores @ value_layer - else: - if use_flash_attention or train_with_flash_attention: - is_causal = self.is_causal and query_length > 1 and flash_attention_causal_mask - if self.is_fp8: - attn_mask = None if is_causal else attention_mask - flash_attention_fast_softmax = True # TODO pass this along - softmax_mode = "fast" if flash_attention_fast_softmax else "None" - enable_recompute = self.is_fp8 if query_length == 1 else flash_attention_recompute - with sdp_kernel(enable_recompute=enable_recompute): - attn_output = self.fused_scaled_dot_product_attention( - query_layer, key_layer, value_layer, attn_mask, 0.0, is_causal, None, softmax_mode - ) - else: - # TODO very similar to the fp8 case above, could be merged. - with sdp_kernel( - enable_recompute=flash_attention_recompute - ) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( - query_layer, - key_layer, - value_layer, - attention_mask, - 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. - is_causal and attention_mask is None, - ) - else: - if self.is_fp8: - attn_output = self.unfused_scaled_dot_product_attention( - query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False - ) - else: - # Workaround util scaled_dot_product_attention support broadcast. - if self.training is True and query_layer.shape != key_layer.shape: - key_layer = torch.broadcast_to(key_layer, query_layer.shape) - value_layer = torch.broadcast_to(value_layer, query_layer.shape) - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. 
- is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - - # Performance improvement for HPU - if self.training is True and htcore: - htcore.mark_step() - attention_scores = None - - attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) - attn_output = attn_output.permute(0, 2, 1, 3) - attn_output = attn_output.reshape(batch_size, query_length, -1) - - attn_output = self.dense(attn_output) - - if output_attentions: - return attn_output, present, attention_scores - else: - return attn_output, present, _ - - else: - if train_with_flash_attention: - if FusedSDPA: - # TODO needs to be turned into a module for quantization - with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( - query_layer, - key_layer, - value_layer, - attention_mask, - self.attention_dropout.p if self.training else 0.0, - self.is_causal and attention_mask is None and query_length > 1, - ) - else: - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=attention_mask, - dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) - - attn_output = self.dense(attn_output) - else: - matmul_result = query_layer @ key_layer.transpose(-1, -2) - - # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) - - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` - if input_dtype == torch.float16 or input_dtype == torch.bfloat16: - attention_scores = attention_scores.to(torch.float32) - - attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) - attention_logits *= self.inv_norm_factor - attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) - # [batch_size, num_heads, q_length, kv_length] - attention_probs = self.attention_dropout(attention_probs) - - if head_mask is not None: - attention_probs = attention_probs * head_mask - - # change view [batch_size, num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) - - # matmul: [batch_size * num_heads, q_length, head_dim] - attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) - - # change view [batch_size, q_length, num_heads * head_dim] - attn_output = self._merge_heads(attn_output) - - attn_output = self.dense(attn_output) - - if output_attentions: - return attn_output, present, attention_probs - else: - return attn_output, present, _ - - def attention_all_reduce(self, attn_output): - if hasattr(self.dense, "all_reduce"): - self.dense.all_reduce(attn_output) - - def post_attn_forward(self, attn_output): - if hasattr(self.dense, "all_reduce"): - self.dense.post_all_reduce(attn_output) - return attn_output - - -class GaudiFalconMLP(FalconMLP): - """ - Inherits from FalconMLP: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py - """ - - def pre_mlp_forward(self, x): - x = 
self.act(self.dense_h_to_4h(x)) - x = self.dense_4h_to_h(x) - return x - - def mlp_all_reduce(self, x): - if hasattr(self.dense_4h_to_h, "all_reduce"): - self.dense_4h_to_h.all_reduce(x) - - def post_mlp_forward(self, x): - if hasattr(self.dense_4h_to_h, "all_reduce"): - self.dense_4h_to_h.post_all_reduce(x) - return x - - -class GaudiFalconDecoderLayer(FalconDecoderLayer): - """ - Inherits from FalconDecoderLayer: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py - The only differences are: - - add new args token_idx and position_ids - - add token_idx and position_ids into attention inputs - - add new args reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - """ - - def __init__(self, config: FalconConfig): - super().__init__(config) - self.self_attention = GaudiFalconAttention(config) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attention.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def update_sincos_cache(self, seq_len): - self.self_attention.update_sincos_cache(seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: int = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states - ( - hidden_states, - present, - attn_scores, - attention_layernorm_out, - mlp_layernorm_out, - ) = self.pre_attn( # layernorm + attention before AllReduce - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - **kwargs, - ) - - self.self_attention.attention_all_reduce(hidden_states) - hidden_states = self.self_attention.post_attn_forward(hidden_states) - - attention_output = hidden_states - - if not self.config.new_decoder_architecture: - if self.config.parallel_attn: - mlp_layernorm_out = attention_layernorm_out - else: - residual = dropout_add( - attention_output, residual, self.config.attention_dropout, training=self.training - ) - mlp_layernorm_out = self.post_attention_layernorm(residual) - - outputs = (present, attn_scores) - - hidden_states = self.mlp.pre_mlp_forward(mlp_layernorm_out) - self.mlp.mlp_all_reduce(hidden_states) - hidden_states = self.mlp.post_mlp_forward(hidden_states) - - if self.config.new_decoder_architecture or self.config.parallel_attn: - hidden_states += attention_output - - output = dropout_add(hidden_states, residual, self.config.hidden_dropout, training=self.training) - - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] - - return outputs # hidden_states, present, attentions - - def pre_attn( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: int = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - ): - if self.config.new_decoder_architecture: - attention_layernorm_out = self.ln_attn(hidden_states) - mlp_layernorm_out = self.ln_mlp(hidden_states) - else: - attention_layernorm_out = self.input_layernorm(hidden_states) - mlp_layernorm_out = None - - # Self attention. 
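Note how `GaudiFalconDecoderLayer.forward` splits each sublayer into three phases: `pre_attn`/`pre_mlp_forward` (layernorm plus the local compute), an explicit `attention_all_reduce`/`mlp_all_reduce`, and `post_attn_forward`/`post_mlp_forward`. The `all_reduce`/`post_all_reduce` hooks are only invoked when the dense layers expose them (i.e. when they have been wrapped for tensor parallelism), which makes the collective a visible, separately schedulable step instead of being hidden inside the linear layer. A minimal sketch of that pattern with a hypothetical sharded linear module (names and sharding here are illustrative, not the wrapper actually used at runtime):

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F


class ShardedDense(torch.nn.Module):
    """Hypothetical tensor-parallel linear exposing the same three-phase split."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features) * 0.02)
        self.bias = torch.nn.Parameter(torch.zeros(out_features))

    def forward_local(self, x):
        # Phase 1: each rank multiplies by its own weight shard, no bias yet.
        return F.linear(x, self.weight)

    def all_reduce(self, y):
        # Phase 2: sum partial results across ranks; no-op when not distributed.
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(y)

    def post_all_reduce(self, y):
        # Phase 3: bias (and any residual handling) applied once, after the sum.
        return y + self.bias


dense = ShardedDense(8, 8)
x = torch.randn(2, 8)
y = dense.forward_local(x)    # "pre" stage
dense.all_reduce(y)           # explicit collective between stages
y = dense.post_all_reduce(y)  # "post" stage
```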
- attn_outputs, present, attn_scores = self.self_attention.pre_attn_forward( - attention_layernorm_out, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - - return attn_outputs, present, attn_scores, attention_layernorm_out, mlp_layernorm_out - - -class GaudiFalconModel(FalconModel): - """ - Inherits from FalconModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py - The only differences are: - - add new args token_idx and position_ids - - add token_idx and position_ids into decoder inputs - - add new arg reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - """ - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.h: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def update_sincos_cache(self, seq_len): - for layer in self.h: - layer.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: int = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - # Compute alibi tensor: check build_alibi_tensor documentation - past_key_values_length = 0 - if past_key_values[0] is not None and token_idx is None: - if reuse_cache: - past_key_values_length = past_key_values[0][0][-2] - else: - past_key_values_length = past_key_values[0][0].shape[-2] - - if self.use_alibi: - mask = ( - torch.ones( - (batch_size, seq_length + past_key_values_length), device=inputs_embeds.device, dtype=torch.long - ) - if attention_mask is None - else attention_mask - ) - alibi = build_alibi_tensor(mask, self.num_heads, dtype=hidden_states.dtype) - else: - alibi = None - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - # TODO: Due to perf degradation, disable spda_attn_mask - use_sdpa_attn_mask = False - - if self._use_sdpa and not output_attentions and use_sdpa_attn_mask: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - if alibi is None: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - elif head_mask is None: - alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) - - # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched. - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # We take care to integrate alibi bias in the attention_mask here. - min_dtype = torch.finfo(alibi.dtype).min - attention_mask = torch.masked_fill( - alibi / math.sqrt(self.config.hidden_size // self.num_heads), - attention_mask < -1, - min_dtype, - ) - - # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 - if seq_length > 1: - attention_mask = GaudiAttentionMaskConverter._unmask_unattended( - attention_mask, min_dtype=min_dtype - ) - else: - # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case. 
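All of the non-SDPA branches here funnel through `_gaudi_prepare_4d_causal_attention_mask`, which expands the 2-D padding mask into the additive `(batch, 1, query_length, kv_length)` bias consumed by the decoder blocks. That helper's implementation is not part of this diff; the sketch below is only a generic illustration of the shape and semantics such a mask has:

```python
import torch


def to_4d_causal(attention_mask_2d, q_len, past_len, dtype=torch.float32):
    # attention_mask_2d: (batch, past_len + q_len), 1 = real token, 0 = padding.
    bsz, kv_len = attention_mask_2d.shape
    min_val = torch.finfo(dtype).min
    # Block future positions (key index beyond past_len + query index)...
    causal = torch.triu(torch.ones(q_len, kv_len, dtype=torch.bool), diagonal=past_len + 1)
    # ...and block padded positions from the 2-D mask.
    blocked = causal[None, None, :, :] | (attention_mask_2d[:, None, None, :] == 0)
    return torch.zeros(bsz, 1, q_len, kv_len, dtype=dtype).masked_fill(blocked, min_val)


mask_4d = to_4d_causal(torch.tensor([[1, 1, 1, 0]]), q_len=4, past_len=0)
print(mask_4d.shape)  # torch.Size([1, 1, 4, 4])
```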
- attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - else: - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape batch_size x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - alibi, - attention_mask, - position_ids, - head_mask[i], - layer_past, - use_cache, - output_attentions, - None, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - alibi=alibi, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Add last hidden state - hidden_states = self.ln_f(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class GaudiFalconForCausalLM(FalconForCausalLM): - """ - Inherits from FalconForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py - The only differences are: - - add new args token_idx and position_ids - - add token_idx and position_ids into model inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - - add new args reuse_cache - - add use_flash_attention - - add flash_attention_recompute - - add flash_attention_causal_mask - """ - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.transformer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - self.kv_cache_len = max_seq_len - - def update_sincos_cache(self, seq_len): - self.transformer.update_sincos_cache(seq_len) - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - token_idx: Optional[torch.Tensor] = None, - **kwargs, - ) -> dict: - reuse_cache = kwargs.get("reuse_cache") - bucket_internal = 
kwargs.get("bucket_internal") - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - elif (reuse_cache or bucket_internal) and token_idx is not None: - # KV cache is pre allocated with reuse cache or will be padded with bucket internal - # hence for the 1st token we can slice the inputs till token idx for the fwd pass. - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - # Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE. - if ( - not self.transformer.use_alibi - and attention_mask is not None - and position_ids is None - and token_idx is not None - ): - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - return { - "input_ids": input_ids, - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "reuse_cache": reuse_cache, - "cache_idx": kwargs.get("cache_idx"), - "use_flash_attention": kwargs.get("use_flash_attention"), - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), - "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), - } - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - trim_logits: Optional[bool] = False, - cache_idx: int = None, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if use_flash_attention: - assert FusedSDPA, "`use_flash_attention` is True, but cannot find FusedSDPA. 
Please import it as `from habana_frameworks.torch.hpex.kernels import FusedSDPA` or set use_flash_attention to False (at the expense of a possible performance degradation)." - if flash_attention_recompute: - assert use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not" - if flash_attention_causal_mask: - assert use_flash_attention, "flash_attention_causal_mask is set, but use_flash_attention is not" - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - hidden_states = transformer_outputs[0] - - _, seq_len, _ = hidden_states.shape - if seq_len > 1 and trim_logits and not self.training: - if token_idx is not None: - hidden_states = hidden_states.index_select(1, token_idx - 1) - else: - hidden_states = hidden_states[:, -1:, :] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_size, seq_length, vocab_size = shift_logits.shape - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/optimum/habana/transformers/models/gemma/__init__.py b/optimum/habana/transformers/models/gemma/__init__.py deleted file mode 100644 index 63a2a2f31d..0000000000 --- a/optimum/habana/transformers/models/gemma/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_gemma import ( - GaudiGemmaDecoderLayer, - GaudiGemmaForCausalLM, - gaudi_gemma_attention_forward, - gaudi_gemma_model_forward, -) diff --git a/optimum/habana/transformers/models/gemma/modeling_gemma.py b/optimum/habana/transformers/models/gemma/modeling_gemma.py deleted file mode 100644 index 7f57e838b8..0000000000 --- a/optimum/habana/transformers/models/gemma/modeling_gemma.py +++ /dev/null @@ -1,489 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Gemma model.""" - -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.gemma.modeling_gemma import ( - GemmaAttention, - GemmaConfig, - GemmaDecoderLayer, - GemmaForCausalLM, - apply_rotary_pos_emb, - repeat_kv, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_gemma_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from GemmaAttention.forward: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py - The only differences are: - - add new args token_idx - - use attention_mask directly instead of using cache_position to compute causal_mask - """ - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - past_key_value = getattr(self, "past_key_value", past_key_value) - - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None) - - if past_key_value is not None: - if token_idx is not None: - if 0 <= self.layer_idx < len(past_key_value.key_cache): - past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) - past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] - else: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = 
torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.view(bsz, q_len, -1) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class GaudiGemmaDecoderLayer(GemmaDecoderLayer): - def __init__(self, config: GemmaConfig, layer_idx: int): - super().__init__(config, layer_idx) - self.self_attn = GemmaAttention(config, layer_idx) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from GemmaDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py - The only differences are: - - add new args token_idx - """ - - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - token_idx=token_idx, - **kwargs, - ) - - hidden_states = residual + hidden_states - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_gemma_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from GemmaModel.forward: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py - The only differences are: - - add new args token_idx - - replace _update_causal_mask with _gaudi_prepare_4d_causal_attention_mask - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`.") - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - past_seen_tokens = 0 - if use_cache: # kept for BC (cache positions) - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - - if cache_position is None: - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - # 4d mask is passed through the layers, not use self._update_causal_mask - causal_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_seen_tokens - ) - - # embed positions - hidden_states = inputs_embeds - - # normalized - hidden_states = hidden_states * (self.config.hidden_size**0.5) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache - ) - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiGemmaForCausalLM(GemmaForCausalLM): - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from GemmaForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs - ): - """ - Inherits from GemmaForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - - token_idx = kwargs.get("token_idx", None) - - past_length = 0 - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() - max_cache_length = ( - torch.tensor(past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() is not None - else None - ) - cache_length = ( - past_length if max_cache_length is None else torch.min(max_cache_length, past_length) - ) - # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. 
- elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - else: - # past_length += token_idx - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - if token_idx is None: - if past_key_value := getattr(self.model.layers[0].self_attn, "past_key_value", None): - # generation with static cache - past_length = past_key_value.get_seq_length() - input_ids = input_ids[:, past_length:] - position_ids = position_ids[:, past_length:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} - - input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] - if cache_position is None: - cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) - else: - cache_position = cache_position[-input_length:] - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/gpt2/__init__.py b/optimum/habana/transformers/models/gpt2/__init__.py deleted file mode 100644 index 4052373a27..0000000000 --- a/optimum/habana/transformers/models/gpt2/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_gpt2 import GaudiGPT2Attention, GaudiGPT2Block, GaudiGPT2LMHeadModel, gaudi_gpt2_forward diff --git a/optimum/habana/transformers/models/gpt2/modeling_gpt2.py b/optimum/habana/transformers/models/gpt2/modeling_gpt2.py deleted file mode 100644 index 3a2e85d24b..0000000000 --- a/optimum/habana/transformers/models/gpt2/modeling_gpt2.py +++ /dev/null @@ -1,588 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions -from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2LMHeadModel, logger - - -class GaudiGPT2Attention(GPT2Attention): - """ - Copied from GPT2Attention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py - The only differences are: - - optimize KV cache - """ - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - key = key.contiguous() - value = value.contiguous() - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - 
attn_weights = attn_weights / torch.full( - [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - ) - - # Layer-wise attention scaling - if self.scale_attn_by_inverse_layer_idx: - attn_weights = attn_weights / float(self.layer_idx + 1) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): - key = key.contiguous() - value = value.contiguous() - # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - # Preallocate attn_weights for `baddbmm` - attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) - - # Compute Scale Factor - scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 - - if self.scale_attn_by_inverse_layer_idx: - scale_factor /= float(self.layer_idx + 1) - - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) - attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise - if attn_weights.dtype != torch.float32: - raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if encoder_hidden_states is not None: - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." - ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim).contiguous() - key = self._split_heads(key, self.num_heads, self.head_dim).contiguous() - value = self._split_heads(value, self.num_heads, self.head_dim).contiguous() - - if layer_past is not None: - past_key, past_value = layer_past - if token_idx is not None: - past_key.index_copy_(2, token_idx - 1, key) - past_value.index_copy_(2, token_idx - 1, value) - key = past_key - value = past_value - else: - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - if use_cache is True: - present = (key, value) - else: - present = None - - if self.reorder_and_upcast_attn: - attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - else: - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -class GaudiGPT2Block(torch.nn.Module): - def __init__(self, config, layer_idx=None): - super().__init__() - hidden_size = config.hidden_size - inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size - attention_class = GaudiGPT2Attention - - self.ln_1 = torch.nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn 
= attention_class(config=config, layer_idx=layer_idx) - self.ln_2 = torch.nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - if config.add_cross_attention: - self.crossattention = attention_class(config=config, is_cross_attention=True, layer_idx=layer_idx) - self.ln_cross_attn = torch.nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - self.mlp = GPT2MLP(inner_dim, config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: - """ - Copied from GPT2Block.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py - The only differences are: - - add new args token_idx - """ - - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - - attn_outputs = self.attn( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - # residual connection - hidden_states = attn_output + residual - - if encoder_hidden_states is not None: - # add one self-attention block for cross-attention - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " - "cross-attention layers by setting `config.add_cross_attention=True`" - ) - residual = hidden_states - hidden_states = self.ln_cross_attn(hidden_states) - cross_attn_outputs = self.crossattention( - hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) - attn_output = cross_attn_outputs[0] - # residual connection - hidden_states = residual + attn_output - outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + feed_forward_hidden_states - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions, cross_attentions) - - -def gaudi_gpt2_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = 
None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from GPT2Model.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py - The only differences are: - - disable autocast for attention_mask - - add new args token_idx - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) - - # GPT2Attention mask. - if attention_mask is not None: - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - - with torch.autocast(enabled=False, device_type="hpu"): - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.add_cross_attention and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape n_layer x batch x n_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - use_cache, - output_attentions, - None, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache 
else 2],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class GaudiGPT2LMHeadModel(GPT2LMHeadModel): - """ - Copied from GPT2LMHeadModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py - The only differences are: - - add new args token_idx - """ - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs - ): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - else: - position_ids = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - "token_idx": token_idx, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.transformer.first_device) - hidden_states = hidden_states.to(self.lm_head.weight.device) - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - cross_attentions=transformer_outputs.cross_attentions, - ) diff --git a/optimum/habana/transformers/models/gpt_bigcode/__init__.py b/optimum/habana/transformers/models/gpt_bigcode/__init__.py deleted file mode 100644 index 556f61f8c7..0000000000 --- a/optimum/habana/transformers/models/gpt_bigcode/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_gpt_bigcode import ( - GaudiGPTBigCodeForCausalLM, - gaudi_gpt_bigcode_attention_forward, - gaudi_gpt_bigcode_block_forward, - gaudi_gpt_bigcode_model_forward, -) diff --git a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py deleted file mode 100644 index e35b4cac52..0000000000 --- a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ /dev/null @@ -1,495 +0,0 @@ -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions -from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM - -from ...modeling_attn_mask_utils import GaudiAttentionMaskConverter - - -def gaudi_gpt_bigcode_attention_forward( - self, - hidden_states: torch.Tensor, - layer_past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = False, - output_attentions: 
Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Union[ - Tuple[torch.Tensor, Optional[torch.Tensor]], - Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]], -]: - """ - Copied from GPTBigCodeAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - if encoder_hidden_states is not None: - if not hasattr(self, "q_attn") or not self.is_cross_attention: - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`." - ) - - query = self.q_attn(hidden_states) - key_value = self.c_attn(encoder_hidden_states) - attention_mask = encoder_attention_mask - elif self.multi_query: - query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2) - else: - # Note: We split as (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim), - # i.e., the memory layout is not the same as GPT2. - # This makes the concatenation with past_key_value more efficient. - query, key_value = ( - self.c_attn(hidden_states) - .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim) - .transpose(1, 2) - .split((self.head_dim, 2 * self.head_dim), dim=3) - ) - - key, value = key_value.split((self.head_dim, self.head_dim), dim=-1) - - if layer_past is not None: - past_key, past_value = layer_past.split((self.head_dim, self.head_dim), dim=-1) - if token_idx is not None: - key = past_key.index_add_(1, token_idx - 1, key - torch.index_select(past_key, 1, token_idx - 1)) - value = past_value.index_add_(1, token_idx - 1, value - torch.index_select(past_value, 1, token_idx - 1)) - else: - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - present = torch.cat((key, value), dim=-1) if use_cache else None - - attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask) - - if not self.multi_query: - attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape) - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - if self.multi_query: - # Transpose to return weights in the usual format (batch_size, num_heads, query_length, key_length) - attn_weights = attn_weights.transpose(1, 2) - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -def gaudi_gpt_bigcode_block_forward( - self, - hidden_states: Optional[Tuple[torch.Tensor]], - layer_past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: - """ - Copied from GPTBigCodeBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_outputs = self.attn( - 
hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - # residual connection - hidden_states = attn_output + residual - - if encoder_hidden_states is not None: - # add one self-attention block for cross-attention - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " - "cross-attention layers by setting `config.add_cross_attention=True`" - ) - residual = hidden_states - hidden_states = self.ln_cross_attn(hidden_states) - cross_attn_outputs = self.crossattention( - hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) - attn_output = cross_attn_outputs[0] - # residual connection - hidden_states = residual + attn_output - outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + feed_forward_hidden_states - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions, cross_attentions) - - -def gaudi_gpt_bigcode_model_forward( - self, - input_ids: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from GPTBigCodeModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py - The only differences are: - - add new args token_idx - - if token_idx and past_key_values are passed, set self_attention_mask based on the static shape of past_key_values - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - 
batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0].size(-2) - - if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_length > 0: - position_ids = position_ids[:, past_length : input_shape[-1] + past_length :] - elif position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) - - # Self-attention mask. - query_length = input_shape[-1] - key_length = past_length + query_length - if past_length > 0 and token_idx is not None: - self_attention_mask = self.bias[None, past_length - 1 : past_length, :past_length] - else: - self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length] - - if attention_mask is not None: - self_attention_mask = self_attention_mask * attention_mask.view(batch_size, 1, -1).to( - dtype=torch.bool, device=self_attention_mask.device - ) - - # MQA models: (batch_size, query_length, n_heads, key_length) - # MHA models: (batch_size, n_heads, query_length, key_length) - self_attention_mask = self_attention_mask.unsqueeze(2 if self.multi_query else 1) - - if self._use_sdpa and head_mask is None and not output_attentions: - # SDPA with a custom mask is much faster in fp16/fp32 dtype rather than bool. Cast here to floating point instead of at every layer. - dtype = self.wte.weight.dtype - min_dtype = torch.finfo(dtype).min - self_attention_mask = torch.where( - self_attention_mask, - torch.full([], 0.0, dtype=dtype, device=self_attention_mask.device), - torch.full([], min_dtype, dtype=dtype, device=self_attention_mask.device), - ) - - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - if self.multi_query: - # gpt_bigcode using MQA has the bad taste to use a causal mask with shape - # [batch_size, target_length, 1, source_length], not compatible with SDPA, hence this transpose. - self_attention_mask = self_attention_mask.transpose(1, 2) - - if query_length > 1 and attention_mask is not None: - # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # produces nans if sequences are completely unattended in the attention mask. 
Details: https://github.com/pytorch/pytorch/issues/110213 - self_attention_mask = GaudiAttentionMaskConverter._unmask_unattended( - self_attention_mask, min_dtype=min_dtype - ) - - attention_mask = self_attention_mask - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.add_cross_attention and encoder_hidden_states is not None and encoder_attention_mask is not None: - if encoder_attention_mask.dim() == 2: - encoder_attention_mask.unsqueeze(1) - assert encoder_attention_mask.dim() == 3 - encoder_attention_mask = encoder_attention_mask.bool().unsqueeze(2 if self.multi_query else 1) - else: - encoder_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape n_layer x batch x n_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = [] if use_cache else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - use_cache, - output_attentions, - None, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - if use_cache: - presents.append(outputs[1]) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class GaudiGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): - """ - Inherits from GPTBigCodeForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py - The only 
differences are: - - add new args token_idx - - add token_idx into model_inputs - - when KV cache is enabled, slice next_input_ids from input_ids based on the token_idx - - when KV cache is enabled, slice next_position_ids from position_ids based on the token_idx - """ - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs - ): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - if token_type_ids is not None: - token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) - else: - if self.config.multi_query: - past_length = past_key_values[0].shape[1] - else: - past_length = past_key_values[0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - else: - position_ids = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - "token_idx": token_idx, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous().to(shift_logits.device) - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - cross_attentions=transformer_outputs.cross_attentions, - ) diff --git a/optimum/habana/transformers/models/gpt_neox/__init__.py b/optimum/habana/transformers/models/gpt_neox/__init__.py deleted file mode 100644 index cceb114b82..0000000000 --- a/optimum/habana/transformers/models/gpt_neox/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .modeling_gpt_neox import ( - GaudiGPTNeoXForCausalLM, - gaudi_gpt_neox_attention_forward, - gaudi_gpt_neox_layer_forward, - gaudi_gpt_neox_model_forward, - gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, -) diff --git a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py b/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py deleted file mode 100644 index b43a5c3237..0000000000 --- a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py +++ /dev/null @@ -1,445 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, apply_rotary_pos_emb, logger - - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE -except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None - - -def gaudi_gpt_neox_attention_forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - position_ids: torch.LongTensor, - head_mask: Optional[torch.FloatTensor] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - padding_mask: Optional[torch.Tensor] = None, - token_idx: Optional[torch.Tensor] = None, -): - """ - Copied from GPTNeoXAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py - The 
only differences are: - - add new args token_idx - - optimize KV cache - """ - # Workaround till FusedRoPE is fixed - global FusedRoPE - if self.training and FusedRoPE is not None: - FusedRoPE = None - - has_layer_past = layer_past is not None - - # Compute QKV - # Attention heads [batch, seq_len, hidden_size] - # --> [batch, seq_len, (np * 3 * head_size)] - qkv = self.query_key_value(hidden_states) - - # [batch, seq_len, (num_heads * 3 * head_size)] - # --> [batch, seq_len, num_heads, 3 * head_size] - new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) - qkv = qkv.view(*new_qkv_shape) - - # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] - query = qkv[..., : self.head_size].permute(0, 2, 1, 3) - key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) - value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) - - # Compute rotary embeddings on rotary_ndims - query_rot = query[..., : self.rotary_ndims] - query_pass = query[..., self.rotary_ndims :] - key_rot = key[..., : self.rotary_ndims] - key_pass = key[..., self.rotary_ndims :] - - # Compute token offset for rotary embeddings (when decoding) - seq_len = key.shape[-2] - if has_layer_past: - seq_len += layer_past[0].shape[-2] - cos, sin = self.rotary_emb(value, seq_len=seq_len) - query, key = apply_customized_rope(query_rot, key_rot, cos, sin, position_ids) - query = torch.cat((query, query_pass), dim=-1).contiguous() - key = torch.cat((key, key_pass), dim=-1).contiguous() - value = value.contiguous() - - # Cache QKV values - if has_layer_past: - past_key = layer_past[0] - past_value = layer_past[1] - if token_idx is not None: - past_key.index_copy_(2, token_idx - 1, key) - past_value.index_copy_(2, token_idx - 1, value) - key = past_key - value = past_value - else: - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - present = (key, value) if use_cache else None - - # Compute attention - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - # Reshape outputs - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) - attn_output = self.dense(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -def gaudi_gpt_neox_layer_forward( - self, - hidden_states: Optional[torch.FloatTensor], - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - layer_past: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -): - """ - Copied from GPTNeoxLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py - The only differences are: - - add new args token_idx - """ - attention_layer_outputs = self.attention( - self.input_layernorm(hidden_states), - attention_mask=attention_mask, - position_ids=position_ids, - layer_past=layer_past, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) - attn_output = self.post_attention_dropout(attn_output) - outputs = attention_layer_outputs[1:] - - if self.use_parallel_residual: - # pseudocode: - # x = x + 
attn(ln1(x)) + mlp(ln2(x)) - mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) - mlp_output = self.post_mlp_dropout(mlp_output) - hidden_states = mlp_output + attn_output + hidden_states - else: - # pseudocode: - # x = x + attn(ln1(x)) - # x = x + mlp(ln2(x)) - attn_output = attn_output + hidden_states - mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) - mlp_output = self.post_mlp_dropout(mlp_output) - hidden_states = mlp_output + attn_output - - if use_cache: - outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) - else: - outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) - - return outputs - - -def gaudi_gpt_neox_model_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from GPTNeoxModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - use_cache = use_cache if use_cache is not None else self.config.use_cache - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * self.config.num_hidden_layers) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) - - # Attention mask. - if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
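As an aside on the mask handling above: the deleted forward converts a 2D padding mask into an additive 4D mask exactly as the comments describe. Below is a minimal standalone sketch of that conversion, assuming nothing beyond stock PyTorch; the helper name `expand_padding_mask` is hypothetical and not part of the deleted module.

```python
import torch

def expand_padding_mask(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # attention_mask: [batch, seq_len], 1 for tokens to attend to, 0 for padding.
    # Insert singleton head and query dims so the mask broadcasts to
    # [batch, num_heads, from_seq_len, to_seq_len].
    mask = attention_mask[:, None, None, :].to(dtype)
    # 1.0 -> 0.0 (keep) and 0.0 -> dtype minimum (mask): adding this to the raw
    # scores before the softmax effectively removes the padded positions.
    return (1.0 - mask) * torch.finfo(dtype).min

if __name__ == "__main__":
    pad_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    additive = expand_padding_mask(pad_mask, torch.float32)
    print(additive.shape)  # torch.Size([2, 1, 1, 4])
```

Adding the result to the raw attention scores before the softmax drives the probability mass on padded positions to zero without any boolean indexing.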
- attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.embed_in(input_ids) - - hidden_states = self.emb_dropout(inputs_embeds) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - presents = () if use_cache else None - all_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - layer.__call__, - hidden_states, - attention_mask, - position_ids, - head_mask[i], - use_cache, - None, - output_attentions, - None, - ) - else: - outputs = layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - if output_attentions: - all_attentions = all_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.final_layer_norm(hidden_states) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_attentions, - ) - - -class GaudiGPTNeoXForCausalLM(GPTNeoXForCausalLM): - """ - Inherits from GPTNeoXForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt_neox/modeling_gpt_neox.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - past_key_values: 
Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.gpt_neox( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - lm_logits = self.embed_out(hidden_states) - - lm_loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # we are doing next-token prediction; shift prediction scores and input ids by one - shift_logits = lm_logits[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - - return CausalLMOutputWithPast( - loss=lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs - ): - input_shape = input_ids.shape - - # cut decoder_input_ids if past is used - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - model_inputs.update( - { - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "position_ids": position_ids, - "token_idx": token_idx, - } - ) - - return model_inputs - - -def gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = 
torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = emb.cos() - self.sin_cached = emb.sin() - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and FusedRoPE: - if q.dtype == torch.bfloat16: - rope_q = FusedRoPE.apply( - q, - cos.unsqueeze(0).unsqueeze(0).to(torch.bfloat16), - sin.unsqueeze(0).unsqueeze(0).to(torch.bfloat16), - position_ids, - ) - else: - rope_q = FusedRoPE.apply(q, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids) - if k.dtype == torch.bfloat16: - rope_k = FusedRoPE.apply( - k, - cos.unsqueeze(0).unsqueeze(0).to(torch.bfloat16), - sin.unsqueeze(0).unsqueeze(0).to(torch.bfloat16), - position_ids, - ) - else: - rope_k = FusedRoPE.apply(k, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids) - return rope_q, rope_k - else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) diff --git a/optimum/habana/transformers/models/gptj/__init__.py b/optimum/habana/transformers/models/gptj/__init__.py deleted file mode 100644 index 23a1d6971b..0000000000 --- a/optimum/habana/transformers/models/gptj/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_gptj import ( - GaudiGPTJAttention, - GaudiGPTJBlock, - GaudiGPTJForCausalLM, - gaudi_gptj_model_forward, -) diff --git a/optimum/habana/transformers/models/gptj/modeling_gptj.py b/optimum/habana/transformers/models/gptj/modeling_gptj.py deleted file mode 100644 index 0fae0f0467..0000000000 --- a/optimum/habana/transformers/models/gptj/modeling_gptj.py +++ /dev/null @@ -1,539 +0,0 @@ -from typing import Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.gptj.modeling_gptj import ( - GPTJMLP, - GPTJAttention, - GPTJForCausalLM, - apply_rotary_pos_emb, - create_sinusoidal_positions, - logger, -) - - -class GaudiGPTJAttention(GPTJAttention): - def _attn( - self, - query, - key, - value, - attention_mask=None, - head_mask=None, - ): - # compute causal mask from causal mask buffer - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] - - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() - - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
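The comment above (and the one that follows) explain why `mask_value` has to be a real tensor on the scores' device rather than a Python float. A minimal standalone illustration of that `torch.where`-based causal masking is sketched below; the function name `masked_scores` is made up for the example.

```python
import torch

def masked_scores(attn_scores: torch.Tensor, causal_mask: torch.Tensor) -> torch.Tensor:
    # causal_mask: boolean [1, 1, q_len, k_len], True where attention is allowed.
    # The fill value is built as a tensor with the scores' dtype and device,
    # which is exactly what the original comments are warning about.
    mask_value = torch.finfo(attn_scores.dtype).min
    mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype, device=attn_scores.device)
    return torch.where(causal_mask, attn_scores, mask_value)

if __name__ == "__main__":
    q_len = k_len = 4
    scores = torch.randn(1, 1, q_len, k_len)
    causal = torch.tril(torch.ones(q_len, k_len, dtype=torch.bool)).view(1, 1, q_len, k_len)
    print(masked_scores(scores, causal)[0, 0])
```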
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - attn_weights = attn_weights / self.scale_attn - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - attn_weights = attn_weights.to(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def forward( - self, - hidden_states: torch.FloatTensor, - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - sin: Optional[torch.Tensor] = None, - cos: Optional[torch.Tensor] = None, - ) -> Union[ - Tuple[torch.Tensor, Tuple[torch.Tensor]], - Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], - ]: - """ - Copied from GPTJAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py - The only differences are: - - add new args token_idx - - remove is_torch_fx_proxy - - optimize KV cache - - pass sin and cos from upper level as they are identical for each attn block - """ - query = self.q_proj(hidden_states) - key = self.k_proj(hidden_states) - value = self.v_proj(hidden_states) - - query = self._split_heads(query, self.num_attention_heads, self.head_dim, True).contiguous() - key = self._split_heads(key, self.num_attention_heads, self.head_dim, True).contiguous() - value = self._split_heads(value, self.num_attention_heads, self.head_dim, False).contiguous() - - if self.rotary_dim is not None: - k_rot = key[:, :, :, : self.rotary_dim] - k_pass = key[:, :, :, self.rotary_dim :] - - q_rot = query[:, :, :, : self.rotary_dim] - q_pass = query[:, :, :, self.rotary_dim :] - # Note: it appears that if we use bf16 RoPE(whether use fused kernel or not), there could be acc issue, hence use fp32 RoPE here Fused kernel feasibility needs to be confirmed in the future - k_rot = apply_rotary_pos_emb(k_rot.to(torch.float32), sin, cos).to(torch.bfloat16) - q_rot = apply_rotary_pos_emb(q_rot.to(torch.float32), sin, cos).to(torch.bfloat16) - - key = torch.cat([k_rot, k_pass], dim=-1) - query = torch.cat([q_rot, q_pass], dim=-1) - else: - key = apply_rotary_pos_emb(key.to(torch.float32), sin, cos).to(torch.bfloat16) - query = apply_rotary_pos_emb(query.to(torch.float32), sin, cos).to(torch.bfloat16) - - key = key.permute(0, 2, 1, 3).contiguous() - query = query.permute(0, 2, 1, 3).contiguous() - - if layer_past is not None: - past_key = layer_past[0] - past_value = layer_past[1] - - if token_idx is not None: - past_key.index_copy_(2, token_idx - 1, key) - past_value.index_copy_(2, token_idx - 1, value) - key = past_key - value = past_value - else: - key = torch.cat([past_key, key], dim=-2) - value = torch.cat([past_value, value], dim=-2) - - if use_cache is True: - # Note that this cast is quite ugly, but is not implemented before ROPE as the original codebase keeps the key in float32 all along 
the computation. - # Reference: https://github.com/kingoflolz/mesh-transformer-jax/blob/f8315e3003033b23f21d78361b288953064e0e76/mesh_transformer/layers.py#L128 - present = (key.to(hidden_states.dtype), value) - else: - present = None - - # compute self-attention: V x Softmax(QK^T) - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) - attn_output = self.out_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -class GaudiGPTJBlock(torch.nn.Module): - def __init__(self, config): - super().__init__() - inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd - self.ln_1 = torch.nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.attn = GaudiGPTJAttention(config) - self.mlp = GPTJMLP(inner_dim, config) - - def forward( - self, - hidden_states: Optional[torch.FloatTensor], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - sin: Optional[torch.Tensor] = None, - cos: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: - """ - Copied from GPTJBlock.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py - The only differences are: - - add new args token_idx - - pass sin and cos from upper level as they are identical for each attn block - """ - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_outputs = self.attn( - hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - sin=sin, - cos=cos, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - - feed_forward_hidden_states = self.mlp(hidden_states) - hidden_states = attn_output + feed_forward_hidden_states + residual - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions) - - -def gaudi_gptj_model_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - sin: Optional[torch.Tensor] = None, - cos: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from GPTJModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py - The only differences are: - - add new args 
token_idx - - pass sin and cos from upper level as they are identical for each attn block - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) - - # Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_attention_heads x N x N - # head_mask has shape n_layer x batch x num_attention_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - - hidden_states = inputs_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - # replace original `_get_embed_positions` method and sin cos calculation in the attn block here to improve perf - rotary_dim = self.config.rotary_dim - embed_dim = self.config.hidden_size - pos_embd_dim = rotary_dim or embed_dim - max_positions = self.config.max_position_embeddings - embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim).to(torch.bfloat16) - embed_positions = embed_positions.repeat(position_ids.shape[0], 1, 1) - if embed_positions.device != position_ids.device: - embed_positions = embed_positions.to(position_ids.device) - repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) - sincos = torch.gather(embed_positions, 1, repeated_position_ids) - sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) - sin = sin.contiguous() - cos = cos.contiguous() - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - None, - attention_mask, - position_ids, - head_mask[i], - use_cache, - output_attentions, - None, - sin, - cos, - ) - else: - outputs = block( - hidden_states=hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - sin=sin, - cos=cos, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class GaudiGPTJForCausalLM(GPTJForCausalLM): - """ - Inherits from GPTJForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/modeling_gptj.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when 
enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - - from step2 when enable KV cache, slice next_token_type_ids from token_type_ids base on the token_idx - """ - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, token_idx=None, **kwargs - ): - token_type_ids = kwargs.get("token_type_ids", None) - # Omit tokens covered by past_key_values - if past_key_values: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - if token_type_ids is not None: - if token_idx is not None: - token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1) - else: - token_type_ids = token_type_ids[:, -input_ids.shape[1] :] - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - "token_idx": token_idx, - } - ) - - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.transformer.first_device) - hidden_states = hidden_states.to(self.lm_head.weight.device) - - # make sure sampling in fp16 works correctly and - # compute loss in fp32 to match with mesh-tf version - # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 - lm_logits = self.lm_head(hidden_states).to(torch.float32) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/optimum/habana/transformers/models/llama/__init__.py b/optimum/habana/transformers/models/llama/__init__.py deleted file mode 100644 index 0a8758d894..0000000000 --- a/optimum/habana/transformers/models/llama/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .configuration_llama import LlamaConfig -from .modeling_llama import ( - GaudiLlamaAttention, - GaudiLlamaDecoderLayer, - GaudiLlamaDynamicNTKScalingRotaryEmbedding, - GaudiLlamaForCausalLM, - GaudiLlamaLinearScalingRotaryEmbedding, - GaudiLlamaMLP, - GaudiLlamaModel, - GaudiLlamaRotaryEmbedding, - gaudi_llama_rmsnorm_forward, -) diff --git a/optimum/habana/transformers/models/llama/configuration_llama.py b/optimum/habana/transformers/models/llama/configuration_llama.py deleted file mode 100644 index 7a92a35c76..0000000000 --- a/optimum/habana/transformers/models/llama/configuration_llama.py +++ /dev/null @@ -1,61 +0,0 @@ -# TODO: To remove when the repo is upgraded to Transformers >= 4.41.0 -from transformers.models.llama.configuration_llama import LlamaConfig - - -class LlamaConfig(LlamaConfig): - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - 
attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - fused_qkv=False, - parallel_strategy=None, - flash_attention_fp8=False, - **kwargs, - ): - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - pad_token_id, - bos_token_id, - eos_token_id, - pretraining_tp, - tie_word_embeddings, - rope_theta, - rope_scaling, - attention_bias, - attention_dropout, - **kwargs, - ) - - self.mlp_bias = mlp_bias - self.fused_qkv = fused_qkv - self.parallel_strategy = parallel_strategy - self.flash_attention_fp8 = flash_attention_fp8 diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py deleted file mode 100755 index ea47de38ed..0000000000 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ /dev/null @@ -1,1373 +0,0 @@ -import copy -import math -import os -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -from torch.distributed.distributed_c10d import ProcessGroup -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import ( - LlamaAttention, - LlamaDecoderLayer, - LlamaForCausalLM, - LlamaMLP, - LlamaModel, - LlamaRMSNorm, - apply_rotary_pos_emb, - logger, -) - -from .... import distributed -from ....distributed.strategy import DistributedStrategy, NoOpStrategy -from ....distributed.tensorparallel import ( - reduce_from_tensor_model_parallel_region, -) -from ....distributed.tp import TPModule -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE - - has_fused_rope = True -except ImportError: - has_fused_rope = False - print("Not using HPU fused kernel for apply_rotary_pos_emb") - -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm - - has_fused_rms_norm = True -except ImportError: - has_fused_rms_norm = False - print("Not using HPU fused kernel for RMSNorm") - -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None - -import habana_frameworks.torch.core as htcore - - -def gaudi_llama_rmsnorm_forward(self, hidden_states): - """ - Copied from LlamaRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - override RMSNorm with Habana fused RMSNorm - """ - if hidden_states.device.type == "hpu" and has_fused_rms_norm: - # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype - if hidden_states.dtype != self.weight.dtype: - orig_dtype = hidden_states.dtype - hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) - return hidden_states.to(orig_dtype) - else: - hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) - return hidden_states - else: - input_dtype = hidden_states.dtype - hidden_states = 
hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class GaudiLlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - super().__init__() - - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self._cos_cached[:seq_len].to(dtype=x.dtype), - self._sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -class GaudiLlamaLinearScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) - - -class GaudiLlamaDynamicNTKScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) - - -class GaudiLlamaMLP(LlamaMLP): - def __init__(self, config): - super(LlamaMLP, self).__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - bias = config.mlp_bias if hasattr(config, 
"mlp_bias") else False - self.gate_proj = torch.nn.Linear(self.hidden_size, self.intermediate_size, bias=bias) - self.up_proj = torch.nn.Linear(self.hidden_size, self.intermediate_size, bias=bias) - self.down_proj = torch.nn.Linear(self.intermediate_size, self.hidden_size, bias=bias) - self.act_fn = ACT2FN[config.hidden_act] - - def pre_mlp_forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - output = sum(down_proj) - else: - input = self.act_fn(self.gate_proj(x)) * self.up_proj(x) - output = self.down_proj(input) - return output - - def mlp_all_reduce(self, x): - if hasattr(self.down_proj, "all_reduce"): - self.down_proj.all_reduce(x) - - def post_mlp_forward(self, x): - if self.config.pretraining_tp > 1: - return x - if hasattr(self.down_proj, "post_all_reduce"): - return self.down_proj.post_all_reduce(x) - return x - - -class TPGaudiLlamaMLP(GaudiLlamaMLP, TPModule): - def __init__( - self, - config, - group: Optional[ProcessGroup] = None, - ): - assert torch.distributed.is_initialized() - rank, world_size = distributed.rank_and_world(group) - hidden_dim = int(config.hidden_grow_factor * config.hidden_size) - assert hidden_dim % world_size == 0, "Hidden dim must be divisible by world size" - - self.config = copy.deepcopy(config) - self.config.intermediate_size = int((config.hidden_grow_factor / world_size) * config.hidden_size) - GaudiLlamaMLP.__init__(self, self.config) - self.setup_tp(rank, world_size) - - def colwise_param_names(self) -> List[str]: - return ["up_proj", "gate_proj"] - - def rowwise_param_names(self) -> List[str]: - return ["down_proj"] - - @staticmethod - def import_module(glu: GaudiLlamaMLP, group: ProcessGroup) -> "TPGaudiLlamaMLP": - config = copy.deepcopy(glu.config) - config.hidden_grow_factor = glu.config.intermediate_size / glu.config.hidden_size - tp_glu = TPGaudiLlamaMLP(config=config, group=group) - return tp_glu - - def pre_mlp_forward(self, x): - out_par = GaudiLlamaMLP.pre_mlp_forward(self, x) - return reduce_from_tensor_model_parallel_region(out_par) - - -def gaudi_llama_repeat_kv( - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, -): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
- The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - -# FusedScaledDotProductAttention -class ModuleFusedSDPA(torch.nn.Module): - def __init__(self, fusedSDPA, scale, attention_dropout, enable_recompute, flash_attention_fp8): - super().__init__() - self._hpu_kernel_fsdpa = fusedSDPA - self.scale = scale - self.attention_dropout = attention_dropout - self.enable_recompute = enable_recompute - self.flash_attention_fp8 = flash_attention_fp8 - - def forward(self, query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, recompute_mode, valid_sequence_lengths, padding_side="left"): - from habana_frameworks.torch.hpex.experimental.transformer_engine import FusedAttention as FusedAttentionTE - if isinstance(self._hpu_kernel_fsdpa, FusedAttentionTE): - return self._hpu_kernel_fsdpa(query, key, value, attn_mask, is_causal, softmax_mode) - else: - return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode, recompute_mode, valid_sequence_lengths, padding_side) - - -class Matmul(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - @staticmethod - def update(prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - return prev - else: - return torch.cat((prev, cur), dim=dim) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - -class GaudiLlamaAttention(LlamaAttention): - def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx) - - self.matmul_qk = Matmul() - self.matmul_av = Matmul() - self.k_cache = KVCache() - self.v_cache = KVCache() - if config.fused_qkv: - self.num_heads = config.num_attention_heads - self.head_dim = config.hidden_size // self.num_heads - self.dim1 = self.num_heads * self.head_dim - self.dim2 = config.num_key_value_heads * self.head_dim - self.qkv_proj = torch.nn.Linear( - self.hidden_size, - self.dim1 + 2 * self.dim2, - bias=config.attention_bias, - ) - self.q_proj = None - self.k_proj = None - self.v_proj = None - self.inp_seq_len = -1 - self.norm_factor = 1.0 / math.sqrt(self.head_dim) - self.fused_scaled_dot_product_attention = ModuleFusedSDPA( - FusedSDPA, - scale=self.norm_factor, - attention_dropout=self.attention_dropout, - enable_recompute=False, - flash_attention_fp8=config.flash_attention_fp8, - ) if FusedSDPA else None - - def get_k_proj_weight(self): - """4bit quantization in GPTQ replaces the k_proj.weight with qweight.""" - if hasattr(self.k_proj, "qweight"): - return self.k_proj.qweight - return self.k_proj.weight - - def get_k_proj_weight_dtype(self): - """ 4bit quantization in GPTQ replaces the k_proj.weight with qweight. - Scales tensor gets the weight dtype. """ - if hasattr(self.k_proj, 'qweight'): - return self.k_proj.scales.dtype - return self.k_proj.weight.dtype - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) - device = self.get_k_proj_weight().device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings - # This helps in avoiding creation of these caches during actual model forward pass and - # reduce memory consumption and improve performance. 
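The `KVCache` module above is the core of the static-shape KV cache used throughout these models: the cache is preallocated to the maximum sequence length and each decoded token is written in place at `token_idx - 1` with `index_copy_`, instead of growing the cache with `torch.cat`. The following is a minimal sketch of that pattern; the `StaticKVCache` name and the simplified prompt/decode split are assumptions, and the deleted module additionally handles equal-shape copies and a `cat` fallback.

```python
import torch

class StaticKVCache:
    """Sketch of a preallocated KV cache updated in place at token_idx - 1."""

    def __init__(self, shape, dtype=torch.float32, device="cpu"):
        # shape: [batch, kv_heads, max_seq_len, head_dim]
        self.cache = torch.zeros(shape, dtype=dtype, device=device)

    def update(self, cur: torch.Tensor, token_idx: torch.Tensor) -> torch.Tensor:
        if cur.shape[2] > 1:
            # Prompt phase: copy the whole prompt into the head of the buffer.
            self.cache[:, :, : cur.shape[2], :].copy_(cur)
        else:
            # Decode phase: write the single new token at position token_idx - 1,
            # keeping the cache shape fixed (no torch.cat, no growing tensors).
            self.cache.index_copy_(2, token_idx - 1, cur)
        return self.cache

if __name__ == "__main__":
    cache = StaticKVCache((1, 2, 8, 4))
    cache.update(torch.randn(1, 2, 3, 4), token_idx=torch.tensor([3]))   # prompt of length 3
    full_k = cache.update(torch.randn(1, 2, 1, 4), token_idx=torch.tensor([4]))  # next token
    print(full_k.shape)  # torch.Size([1, 2, 8, 4])
```

The point of keeping the shape fixed, as the surrounding code suggests, is that tensor shapes stay constant across decoding steps rather than changing on every generated token.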
- if seq_len > self.max_position_embeddings: - self.max_position_embeddings = seq_len - _, _ = self.rotary_emb(self.get_k_proj_weight(), seq_len=seq_len) - - def reorder(self, tensor, beam_idx, dim_a, dim_b): - updated = tensor.index_select(0, beam_idx) - tensor.copy_(updated) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - if self.k_cache.cache is None: - return (None, None) - - head_dim = self.k_cache.cache.size(-1) - seq_length = self.k_cache.cache.size(-2) - self.reorder(self.k_cache.cache, beam_idx, seq_length, head_dim) - self.reorder(self.v_cache.cache, beam_idx, seq_length, head_dim) - return (self.k_cache.cache.shape, self.v_cache.cache.shape) - - def pre_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - num_virtual_tokens: int = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from LlamaAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - add new args token_idx - - optimize KV cache - - add new args attn_softmax_bf16 - - add new args reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - - add new arg flash_attention_fast_softmax - - add new arg num_virtual_tokens - """ - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.get_k_proj_weight().split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - if self.config.fused_qkv: - qkv_states = self.qkv_proj(hidden_states) - query_states, key_states, value_states = torch.split( - qkv_states, [self.dim1, self.dim2, self.dim2], dim=-1 - ) - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # TODO: update when auto mp params is enabled in DeepSpeed (cf. 
https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) - key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if token_idx is None: - if hasattr(past_key_value, "get_usable_length"): - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value[0].shape[-2] - else: - if reuse_cache and not isinstance(past_key_value[0], torch.Tensor): - kv_seq_len = past_key_value[0][-2] - else: - if num_virtual_tokens is not None and num_virtual_tokens == past_key_value[0].shape[-2]: - kv_seq_len = past_key_value[0].shape[-2] + kv_seq_len - else: - kv_seq_len = past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - if use_cache: - # reuse k, v, self_attention - if reuse_cache: - if past_key_value is not None and isinstance(past_key_value[0], torch.Tensor): - # prefix tuning case. attach past_key_value to generate first token. - key_states = torch.cat((past_key_value[0], key_states), -2) - value_states = torch.cat((past_key_value[1], value_states), -2) - key_states = self.k_cache(key_states, 2, token_idx) - value_states = self.v_cache(value_states, 2, token_idx) - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if past_key_value is None: - past_key = torch.zeros(key_states.shape, dtype=self.get_k_proj_weight_dtype(), device=key_states.device) - past_value = torch.zeros( - key_states.shape, dtype=self.get_k_proj_weight_dtype(), device=key_states.device - ) - # Return list instead of tuple - past_key_value = [past_key, past_value] - if ( - token_idx is not None - and num_virtual_tokens is not None - and num_virtual_tokens == past_key_value[0].shape[-2] - ): - # prefix tunining case. attach past_key_value to generate first token. 
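One detail from `pre_attn_forward` above worth isolating is the `config.fused_qkv` branch, where a single projection produces Q, K and V and `torch.split` recovers the three chunks. A minimal sketch of that projection follows; the `FusedQKVProj` name is hypothetical, and the subsequent view/transpose into `[batch, heads, seq, head_dim]` done by the real code is left out.

```python
import torch
import torch.nn as nn

class FusedQKVProj(nn.Module):
    """Sketch of a single fused QKV projection split back into Q, K and V."""

    def __init__(self, hidden_size: int, num_heads: int, num_kv_heads: int, bias: bool = False):
        super().__init__()
        self.head_dim = hidden_size // num_heads
        self.q_dim = num_heads * self.head_dim
        self.kv_dim = num_kv_heads * self.head_dim
        # One matmul instead of three separate q/k/v projections.
        self.qkv_proj = nn.Linear(hidden_size, self.q_dim + 2 * self.kv_dim, bias=bias)

    def forward(self, hidden_states: torch.Tensor):
        qkv = self.qkv_proj(hidden_states)
        # Split along the feature dim into query, key and value chunks.
        return torch.split(qkv, [self.q_dim, self.kv_dim, self.kv_dim], dim=-1)

if __name__ == "__main__":
    proj = FusedQKVProj(hidden_size=32, num_heads=4, num_kv_heads=2)
    q, k, v = proj(torch.randn(2, 6, 32))
    print(q.shape, k.shape, v.shape)  # (2, 6, 32) (2, 6, 16) (2, 6, 16)
```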
- key_states = torch.cat((past_key_value[0], key_states), -2) - value_states = torch.cat((past_key_value[1], value_states), -2) - past_key_value = (key_states, value_states) - else: - key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) - - if token_idx is None: - past_key_value = (key_states, value_states) - - if cache_idx is not None and q_len == 1: - key_states = key_states[:, :, :cache_idx, :] - value_states = value_states[:, :, :cache_idx, :] - if attention_mask is not None: - attention_mask = attention_mask[:, :, :, :cache_idx] - kv_seq_len = key_states.shape[-2] - else: - past_key_value = None - - if use_flash_attention and FusedSDPA: - if q_len == 1: - # next token - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, 0.0, False, None, "None", False, None, "None" - ) - else: - softmax_mode = "fast" if flash_attention_fast_softmax else "None" - - # first token - if flash_attention_causal_mask: - # causal masking on first token requires inputs to be of the same length - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, None, 0.0, True, None, softmax_mode, flash_attention_recompute, valid_sequence_lengths, "left" - ) - else: - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, 0.0, False, None, softmax_mode, flash_attention_recompute, None, "None" - ) - else: - query_states, key_states, value_states, attention_mask = gaudi_llama_repeat_kv( - query_states, key_states, value_states, attention_mask, self.num_key_value_groups - ) - - attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) * self.norm_factor - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask - if cache_position is not None: - causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - if attn_softmax_bf16: - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) - else: - # upcast attention to fp32 - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( - query_states.dtype - ) - attn_weights = torch.nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = self.matmul_av(attn_weights, value_states) - attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - if not reuse_cache and token_idx is not None and cache_idx is not None and q_len == 1: - # Return only past key value shapes and not the tensors during decode phase (q len is 1) - # to avoid making past key values as persistent output tensors of HPU graphs. 
- past_key_value = (past_key_value[0].shape, past_key_value[1].shape) - - return attn_output, attn_weights, past_key_value - - def attention_all_reduce(self, attn_output): - if hasattr(self.o_proj, "all_reduce"): - self.o_proj.all_reduce(attn_output) - - def post_attn_forward(self, attn_output): - if hasattr(self.o_proj, "post_all_reduce"): - self.o_proj.post_all_reduce(attn_output) - return attn_output - -class TPGaudiLlamaAttention(GaudiLlamaAttention, TPModule): - def __init__( - self, - config: LlamaConfig, - layer_idx: Optional[int] = None, - group: Optional[ProcessGroup] = None, - ): - super().__init__(config, layer_idx) - - assert torch.distributed.is_initialized() - rank, world_size = distributed.rank_and_world(group) - assert config.num_attention_heads % world_size == 0, "The number of heads must be divisible by world size" - self.config = copy.deepcopy(config) - - self.pre_tp_kvheads = config.num_key_value_heads - GaudiLlamaAttention.__init__(self, self.config, layer_idx) - self.config.num_attention_heads = self.config.num_attention_heads // world_size - self.config.num_key_value_heads = ( - (self.config.num_key_value_heads // world_size) - if self.config.num_key_value_heads > 1 - else self.config.num_key_value_heads - ) - self.head_dim = config.hidden_size // config.num_attention_heads - self.hidden_size = self.config.hidden_size // world_size - self.num_heads = self.config.num_attention_heads - - self.q_proj = torch.nn.Linear( - config.hidden_size, self.config.num_attention_heads * self.head_dim, bias=config.attention_bias - ) - self.k_proj = torch.nn.Linear( - config.hidden_size, self.config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.v_proj = torch.nn.Linear( - config.hidden_size, self.config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.o_proj = torch.nn.Linear( - self.config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias - ) - self.norm_factor = 1.0 / math.sqrt(self.head_dim) - self.setup_tp(rank, world_size) - - def colwise_param_names(self) -> List[str]: - colwise_weights = ["q_proj"] - if self.pre_tp_kvheads != 1: - colwise_weights.append("k_proj") - colwise_weights.append("v_proj") - return colwise_weights - - def rowwise_param_names(self) -> List[str]: - return ["o_proj"] - - @staticmethod - def import_module(mha: GaudiLlamaAttention, layer_idx, group: ProcessGroup) -> "TPGaudiLlamaAttention": - tp_mha = TPGaudiLlamaAttention(config=mha.config, layer_idx=layer_idx, group=group) - return tp_mha - - def pre_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - hidden_states, attn_weights, present_key_value = GaudiLlamaAttention.pre_attn_forward( - self, - hidden_states, - attention_mask, - position_ids, - past_key_value, - 
output_attentions, - use_cache, - cache_position, - token_idx, - attn_softmax_bf16, - reuse_cache, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - flash_attention_fast_softmax, - valid_sequence_lengths, - cache_idx, - **kwargs, - ) - - hidden_states = reduce_from_tensor_model_parallel_region(hidden_states) - return hidden_states, attn_weights, present_key_value - - -class GaudiLlamaDecoderLayer(LlamaDecoderLayer): - def __init__(self, config: LlamaConfig, layer_idx: int): - super(LlamaDecoderLayer, self).__init__() - self.hidden_size = config.hidden_size - - self.self_attn = GaudiLlamaAttention(config=config, layer_idx=layer_idx) - - self.mlp = GaudiLlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.self_attn.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.self_attn.update_sincos_cache(seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - num_virtual_tokens: int = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from LlamaDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - add new args token_idx - - add new args attn_softmax_bf16 - - add new args reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - - add new arg flash_attention_fast_softmax - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - residual = hidden_states - hidden_states, self_attn_weights, present_key_value = self.pre_attn( - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - cache_position, - token_idx, - attn_softmax_bf16, - reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - flash_attention_fast_softmax=flash_attention_fast_softmax, - valid_sequence_lengths=valid_sequence_lengths, - cache_idx=cache_idx, - num_virtual_tokens=num_virtual_tokens, - **kwargs, - ) - self.self_attn.attention_all_reduce(hidden_states) - hidden_states, residual = self.post_attn_pre_mlp(hidden_states, residual) - self.mlp.mlp_all_reduce(hidden_states) - hidden_states = self.post_mlp(hidden_states, residual) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - if use_cache: - outputs += (present_key_value,) - - return outputs - - def pre_attn( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - num_virtual_tokens: int = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - hidden_states = self.input_layernorm(hidden_states) - hidden_states, attn_weights, present_key_value = self.self_attn.pre_attn_forward( - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - cache_position, - token_idx, - attn_softmax_bf16, - reuse_cache, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - flash_attention_fast_softmax, - valid_sequence_lengths, - cache_idx=cache_idx, - num_virtual_tokens=num_virtual_tokens, - ) - return hidden_states, attn_weights, present_key_value - - def post_attn_pre_mlp(self, hidden_states, residual): - hidden_states = self.self_attn.post_attn_forward(hidden_states) - - if self.training: - hidden_states = hidden_states + residual - residual = hidden_states - else: - residual.add_(hidden_states) - hidden_states = residual - - hidden_states = self.post_attention_layernorm(hidden_states) - - hidden_states = self.mlp.pre_mlp_forward(hidden_states) - return hidden_states, residual - - def post_mlp(self, hidden_states, residual): - hidden_states = self.mlp.post_mlp_forward(hidden_states) - - if self.training: - hidden_states = hidden_states + residual - else: - residual.add_(hidden_states) - hidden_states = residual - - return hidden_states - - -class GaudiLlamaModel(LlamaModel): - """ - Copied from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L909 - """ - - def __init__(self, config: LlamaConfig): - """ - Copied from 
https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L917 - 1. set fill_value to 1 instead of True - 2. add device=self.device - """ - super(LlamaModel, self).__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - layers = [] - for layer_idx in range(config.num_hidden_layers): - layer = GaudiLlamaDecoderLayer(config, layer_idx) - if config.parallel_strategy is not None: - layer = config.parallel_strategy.distribute_layer(layer, layer_idx) - layers.append(layer) - self.layers = torch.nn.ModuleList(layers) - # parallel_strategy is not JSON serializable - config.parallel_strategy = None - - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) - - def update_sincos_cache(self, seq_len): - for layer in self.layers: - layer.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - lazy_mode: Optional[bool] = True, - num_virtual_tokens: int = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from LlamaModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - add new args token_idx - - add new args attn_softmax_bf16 - - add new args reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - - add new arg flash_attention_causal_mask - - add new arg flash_attention_fast_softmax - - add new arg lazy_mode - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - 
batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - if hasattr(self.config, "use_fused_rope") and self.config.use_fused_rope is False: - global has_fused_rope - has_fused_rope = False - if hasattr(self.config, "use_fused_rms_norm") and self.config.use_fused_rms_norm is False: - global has_fused_rms_norm - has_fused_rms_norm = False - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - ignore_cache_position = True # Ignoring cache position for HPU - use_new_cache = False # Ignoring new Cache path for HPU - - past_seen_tokens = 0 - - if past_key_values is not None and use_cache: # kept for BC (cache positions) - if reuse_cache: - if isinstance(past_key_values[0][0], torch.Tensor): - past_seen_tokens = past_key_values[0][0].shape[2] - else: - past_seen_tokens = past_key_values[0][0][2] - else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] - - if ignore_cache_position is False: - if cache_position is None: - if isinstance(past_key_values, StaticCache): - raise ValueError("cache_position is a required argument when using StaticCache.") - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - if position_ids is None and cache_position: - position_ids = cache_position.unsqueeze(0) - - else: - if position_ids is None: - position_ids = torch.arange( - past_seen_tokens, seq_length + past_seen_tokens, dtype=torch.long, device=inputs_embeds.device - ) - position_ids = position_ids.unsqueeze(0) - cache_position = None - - # HPU specific mask generation - if ignore_cache_position: - causal_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, - input_ids.shape if input_ids is not None else (batch_size, seq_length), - inputs_embeds, - past_seen_tokens, - ) - else: - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_seen_tokens) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if not use_new_cache else None - - if lazy_mode: - htcore.mark_step() - - for layer_idx, decoder_layer in enumerate(self.layers): - if ( - lazy_mode - and not self.training - and (torch.distributed.is_initialized() is False or torch.distributed.get_world_size() == 1) - ): - htcore.mark_step() - - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - None, - attn_softmax_bf16, - False, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - flash_attention_fast_softmax, - valid_sequence_lengths, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=None if 
past_key_values is None else past_key_values[layer_idx], - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - token_idx=token_idx, - attn_softmax_bf16=attn_softmax_bf16, - reuse_cache=reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - flash_attention_fast_softmax=flash_attention_fast_softmax, - valid_sequence_lengths=valid_sequence_lengths, - cache_idx=cache_idx, - num_virtual_tokens=num_virtual_tokens, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache - ) - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiLlamaForCausalLM(LlamaForCausalLM): - """ - Inherits from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - - add new args attn_softmax_bf16 - - add new args reuse_cache - """ - - def __init__(self, config, parallel_strategy: DistributedStrategy = NoOpStrategy): - config.parallel_strategy = parallel_strategy - super().__init__(config) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.model.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.model.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - trim_logits: Optional[bool] = False, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - flash_attention_fast_softmax: Optional[bool] = False, - valid_sequence_lengths: torch.Tensor = None, - cache_idx: int = None, - lazy_mode: Optional[bool] = True, - num_virtual_tokens: int = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.generation_config.use_fused_rope is False: - global has_fused_rope - has_fused_rope = False - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - token_idx=token_idx, - attn_softmax_bf16=attn_softmax_bf16, - reuse_cache=reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - flash_attention_fast_softmax=flash_attention_fast_softmax, - valid_sequence_lengths=valid_sequence_lengths, - cache_idx=cache_idx, - lazy_mode=lazy_mode, - num_virtual_tokens=num_virtual_tokens, - ) - hidden_states = outputs[0] - _, seq_len, _ = hidden_states.shape - if seq_len > 1 and trim_logits and not self.training: - if token_idx is not None: - hidden_states = hidden_states.index_select(1, token_idx - 1) - else: - hidden_states = hidden_states[:, -1, :] - - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = torch.nn.CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - token_idx=None, - **kwargs, - ): - past_length = 0 - - reuse_cache = kwargs.get("reuse_cache") - bucket_internal = kwargs.get("bucket_internal") - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - if isinstance(past_key_values, Cache): - past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() - max_cache_length = ( - torch.tensor(past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() is not None - else None - ) - cache_length = ( - past_length if max_cache_length is None else torch.min(max_cache_length, past_length) - ) - # TODO joao: remove this 
`else` after `generate` prioritizes `Cache` objects - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - elif (reuse_cache or bucket_internal) and token_idx is not None: - # KV cache is pre allocated with reuse cache or will be padded with bucket internal - # hence for the 1st token we can slice the inputs till token idx for the fwd pass. - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # keep cache_position implementation as None for HPU - cache_position = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise - # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 - # TODO: use `next_tokens` directly instead. 
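# Illustrative sketch (toy tensors, assumed values): how the slicing above behaves
# once the KV cache is in use. `token_idx` points one past the last written slot,
# so only the previously generated token and its position are fed to the model,
# while position_ids are rebuilt from the attention mask so padding never shifts
# real positions.
import torch

input_ids = torch.tensor([[15, 16, 17, 42, 0, 0, 0, 0]])    # 42 was generated last step
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0]])   # covers prompt + generated tokens
token_idx = torch.tensor([4])                                # next slot to fill

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
next_input_ids = torch.index_select(input_ids, 1, token_idx - 1)        # -> [[42]]
next_position_ids = torch.index_select(position_ids, 1, token_idx - 1)  # -> [[3]]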
- model_inputs = {"input_ids": input_ids.contiguous()} - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "trim_logits": kwargs.get("trim_logits"), - "attn_softmax_bf16": kwargs.get("attn_softmax_bf16"), - "reuse_cache": reuse_cache, - "use_flash_attention": kwargs.get("use_flash_attention"), - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), - "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), - "flash_attention_fast_softmax": kwargs.get("flash_attention_fast_softmax"), - "valid_sequence_lengths": kwargs.get("valid_sequence_lengths"), - "cache_idx": kwargs.get("cache_idx"), - "lazy_mode": kwargs.get("lazy_mode"), - "num_virtual_tokens": kwargs.get("num_virtual_tokens"), - } - ) - return model_inputs - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and has_fused_rope: - # TODO: remove `.clone()` when it is fixed in SynapseAI - if k.dtype == torch.bfloat16: - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, - cos.unsqueeze(0).unsqueeze(0).clone().to(torch.bfloat16), - sin.unsqueeze(0).unsqueeze(0).clone().to(torch.bfloat16), - position_ids, - ) - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ) - else: - # keep the same implementation as Transformers v4.37.2 - return apply_rotary_pos_emb(q, k, cos[position_ids], sin[position_ids]) diff --git a/optimum/habana/transformers/models/llava/__init__.py b/optimum/habana/transformers/models/llava/__init__.py deleted file mode 100644 index 7059aee3ab..0000000000 --- a/optimum/habana/transformers/models/llava/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_llava import GaudiLlavaForConditionalGeneration diff --git a/optimum/habana/transformers/models/llava/modeling_llava.py b/optimum/habana/transformers/models/llava/modeling_llava.py deleted file mode 100644 index f2746c2a3a..0000000000 --- a/optimum/habana/transformers/models/llava/modeling_llava.py +++ /dev/null @@ -1,307 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
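# Illustrative sketch (standard formula, assumed shapes): the non-fused fallback of
# `apply_customized_rope` above applies rotary position embeddings with the usual
# rotate-half construction; the fused HPU kernel computes the same result in one op.
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope_sketch(q, k, cos, sin, position_ids):
    # q, k: (batch, heads, seq, head_dim); cos, sin: (max_seq, head_dim)
    cos = cos[position_ids].unsqueeze(1)  # -> (batch, 1, seq, head_dim)
    sin = sin[position_ids].unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin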
-"""PyTorch Llava model.""" - -from typing import List, Optional, Tuple, Union - -import torch -from transformers.cache_utils import Cache -from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast, LlavaForConditionalGeneration -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -def _pad_inputs( - input_ids, attention_mask, image_token_index, num_patches, pad_token_id, vision_feature_select_strategy=None -): - """ - pad inputs for static shape - """ - - if vision_feature_select_strategy == "default": - num_patches = num_patches - elif vision_feature_select_strategy == "full": - num_patches = num_patches + 1 - else: - raise ValueError(f"Unexpected select feature strategy: {vision_feature_select_strategy}") - image_offset = 0 - new_input_ids = [] - new_attention_mask = [] - tokens_pos = [] - for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask): - image_token_indices = torch.where(cur_input_ids == image_token_index)[0].tolist() + [cur_input_ids.shape[0]] - - cur_input_ids_extend = [] - cur_attention_mask_extend = [] - cur_token_pos = [] - start = 0 - for i, image_token_indice in enumerate(image_token_indices): - token_pos = len(cur_input_ids_extend) - cur_input_ids_extend.extend(cur_input_ids[start:image_token_indice].cpu().tolist()) - cur_attention_mask_extend.extend(cur_attention_mask[start:image_token_indice].cpu().tolist()) - cur_token_pos.extend(list(range(token_pos, len(cur_input_ids_extend)))) - if i != len(image_token_indices) - 1: - cur_input_ids_extend.extend([image_token_index] * num_patches) - cur_attention_mask_extend.extend([1] * num_patches) - cur_token_pos.append(image_token_indice) - start = image_token_indice + 1 - - new_input_ids.append(cur_input_ids_extend) - new_attention_mask.append(cur_attention_mask_extend) - tokens_pos.append(cur_token_pos) - - max_len = max(len(x) for x in new_input_ids) - image_offset += max_len - input_ids.shape[1] - - # padding - new_input_ids_padded = [] - new_attention_mask_padded = [] - tokens_pos_padded = [] - - # left padding for no image in example, so we don't need change token_idx - for cur_new_ids, cur_attention_mask, cur_token_pos in zip(new_input_ids, new_attention_mask, tokens_pos): - pad_len = max_len - len(cur_new_ids) - new_input_ids_padded.append([pad_token_id] * pad_len + cur_new_ids) - new_attention_mask_padded.append([0] * pad_len + cur_attention_mask) - tokens_pos_padded.append([x + pad_len for x in cur_token_pos]) - - input_ids = torch.tensor(new_input_ids_padded).to(input_ids.device) - attention_mask = torch.tensor(new_attention_mask_padded).to(input_ids.device) - tokens_pos = torch.tensor(tokens_pos_padded).to(input_ids.device) - - return input_ids, attention_mask, image_offset, tokens_pos - - -def _merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, image_token_index): - """ - Merge text and images - """ - batch_size, sequence_length, embed_dim = inputs_embeds.shape - - batch_indices, image_indices = torch.where(input_ids == image_token_index) - - inputs_embeds[batch_indices, image_indices] = image_features.contiguous().reshape(-1, embed_dim) - - return inputs_embeds.contiguous() - - -class GaudiLlavaForConditionalGeneration(LlavaForConditionalGeneration): - def forward( - self, - input_ids: torch.LongTensor = None, - pixel_values: torch.FloatTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - 
inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - image_offset: Optional[int] = None, - tokens_pos: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: - """ - Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llava/modeling_llava.py - The only differences are: - - add new args token_idx - - add new args image_offset - - add new args tokens_pos - """ - - if token_idx is not None: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_feature_layer = ( - vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer - ) - vision_feature_select_strategy = ( - vision_feature_select_strategy - if vision_feature_select_strategy is not None - else self.config.vision_feature_select_strategy - ) - - # 1. Extract the input embeddings - inputs_embeds = self.get_input_embeddings()(input_ids) - - # 2. Merge text and images - if pixel_values is not None and input_ids.shape[1] != 1: - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - # this is not memory efficient at all: (output_hidden_states=True) will save all the hidden states.
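# Illustrative sketch (toy sizes, hypothetical token id): the merge performed by
# `_merge_input_ids_with_image_features` above overwrites every <image> placeholder
# position in inputs_embeds with one row of the projected image features.
import torch

IMAGE_TOKEN_INDEX = 32000                                   # assumed placeholder id
input_ids = torch.tensor([[1, IMAGE_TOKEN_INDEX, IMAGE_TOKEN_INDEX, 5, 6]])
inputs_embeds = torch.zeros(1, 5, 8)                        # (batch, seq, embed_dim)
image_features = torch.randn(1, 2, 8)                       # (images, patches, embed_dim)

batch_indices, image_positions = torch.where(input_ids == IMAGE_TOKEN_INDEX)
inputs_embeds[batch_indices, image_positions] = image_features.reshape(-1, 8)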
- selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: - raise ValueError( - f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}" - ) - - image_features = self.multi_modal_projector(selected_image_feature) - inputs_embeds = _merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, self.config.image_token_index - ) - - outputs = self.language_model( - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx + image_offset, - ) - - if input_ids.shape[1] != 1 and pixel_values is not None: - batch_size, seq_len = tokens_pos.shape - batch_indices = torch.arange(batch_size).repeat_interleave(seq_len) - logits = outputs[0][batch_indices, tokens_pos.reshape(-1), :].reshape(batch_size, seq_len, -1) - else: - logits = outputs[0] - - loss = None - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return LlavaCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - else: - return super().forward( - input_ids=input_ids, - pixel_values=pixel_values, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs - ): - """ - Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llava/modeling_llava.py - The only differences are: - - add new args token_idx - - add new args image_offset - - add new args tokens_pos - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_id - """ - token_idx = kwargs.get("token_idx", None) - image_offset = 0 - tokens_pos = None - if token_idx is not None and pixel_values is not None: - input_ids, attention_mask, image_offset, tokens_pos = _pad_inputs( - input_ids, - attention_mask, - self.config.image_token_index, - self.vision_tower.vision_model.embeddings.num_patches, - self.pad_token_id, - vision_feature_select_strategy=self.config.vision_feature_select_strategy, - ) - - past_length = 0 - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - else: - cache_length = past_length = past_key_values[0][0].shape[2] - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the 
inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - elif self.config.image_token_index in input_ids: - input_ids = input_ids[:, input_ids.shape[1] - 1 :] - # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the - # older attention values, as their corresponding values are not part of the input. - if cache_length < past_length and attention_mask is not None: - attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :] - else: - # past_length += token_idx - input_ids = torch.index_select(input_ids, 1, token_idx + image_offset - 1) - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "token_idx": token_idx, - "image_offset": image_offset, - "tokens_pos": tokens_pos, - } - ) - - return model_inputs diff --git a/optimum/habana/transformers/models/llava_next/__init__.py b/optimum/habana/transformers/models/llava_next/__init__.py deleted file mode 100644 index d206616106..0000000000 --- a/optimum/habana/transformers/models/llava_next/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_llava_next import GaudiLlavaNextForConditionalGeneration diff --git a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py b/optimum/habana/transformers/models/llava_next/modeling_llava_next.py deleted file mode 100644 index 7fd76c5640..0000000000 --- a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py +++ /dev/null @@ -1,389 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Llava-NeXT model.""" - -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from transformers.models.llava_next.modeling_llava_next import ( - LlavaNextCausalLMOutputWithPast, - LlavaNextForConditionalGeneration, - get_anyres_image_grid_shape, - unpad_image, -) -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class GaudiLlavaNextForConditionalGeneration(LlavaNextForConditionalGeneration): - def forward( - self, - input_ids: torch.LongTensor = None, - pixel_values: torch.FloatTensor = None, - image_sizes: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]: - """ - Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L433 - The only differences are: - - add new args token_idx - - Moved the process of merging images into inputs_embeds into prepare_inputs_for_generation - """ - - if token_idx is not None: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - outputs = self.language_model( - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx + self.image_offset, - ) - - if inputs_embeds.shape[1] != 1 and pixel_values is not None: - batch_size, seq_len = self.text_tokens_pos.shape - batch_indices = torch.arange(batch_size).repeat_interleave(seq_len) - logits = outputs[0][batch_indices, self.text_tokens_pos.reshape(-1), :].reshape( - batch_size, seq_len, -1 - ) - else: - logits = outputs[0] - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - shift_attention_mask = attention_mask[..., 1:] - shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() - shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - 
shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return LlavaNextCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - else: - return super().forward( - input_ids=input_ids, - pixel_values=pixel_values, - image_sizes=image_sizes, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - # Copied from https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L356 - # Remove the step 6: Mask out the embedding at padding positions - def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels): - num_images, num_image_patches, embed_dim = image_features.shape - batch_size, sequence_length = input_ids.shape - left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id)) - # 1. Create a mask to know where special image tokens are - special_image_token_mask = input_ids == self.config.image_token_index - num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1) - # Compute the maximum embed dimension - max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length - batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index) - - # 2. Compute the positions where text should be written - # Calculate new positions for text tokens in merged image-text sequence. - # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens. - # `torch.cumsum` computes how each image token shifts subsequent text token positions. - # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. - new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1 - text_tokens_pos = new_token_positions - nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1] - if left_padding: - new_token_positions += nb_image_pad[:, None] # offset for left padding - text_to_overwrite = new_token_positions[batch_indices, non_image_indices] - - # 3. Create the full embedding, already padded to the maximum position - final_embedding = torch.zeros( - batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device - ) - final_attention_mask = torch.zeros( - batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device - ) - - if labels is not None: - final_labels = torch.full( - (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device - ) - # In case the Vision model or the Language model has been offloaded to CPU, we need to manually - # set the corresponding tensors into their correct target device. 
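# Illustrative sketch (toy example, assumed image-token id 99): the position
# bookkeeping above expands each image token into `num_image_patches` slots and
# uses a cumulative sum of the per-token expansion sizes to find where every
# original token lands in the merged sequence.
import torch

num_image_patches = 3
input_ids = torch.tensor([[10, 99, 11, 12]])                # 99 = image token
special = input_ids == 99
new_token_positions = torch.cumsum(special * (num_image_patches - 1) + 1, dim=-1) - 1
# -> tensor([[0, 3, 4, 5]]): the image token expands into slots 1..3, so every
#    text token after it shifts right by num_image_patches - 1.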
- target_device = inputs_embeds.device - batch_indices, non_image_indices, text_to_overwrite = ( - batch_indices.to(target_device), - non_image_indices.to(target_device), - text_to_overwrite.to(target_device), - ) - attention_mask = attention_mask.to(target_device) - - # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"] - # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features - final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] - final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices] - if labels is not None: - final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices] - - # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling - image_to_overwrite = torch.all(final_embedding == 0, dim=-1) - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) - - if image_to_overwrite.sum() != image_features.shape[:-1].numel(): - raise ValueError( - f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while" - f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation." - ) - - final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device) - final_attention_mask |= image_to_overwrite - position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1) - - # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens. - # batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id) - # indices_to_mask = new_token_positions[batch_indices, pad_indices] - - # final_embedding[batch_indices, indices_to_mask] = 0 - if labels is None: - final_labels = None - - return final_embedding, final_attention_mask, final_labels, position_ids, text_tokens_pos - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - inputs_embeds=None, - pixel_values=None, - image_sizes=None, - attention_mask=None, - **kwargs, - ): - """ - Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L635 - The only differences are: - - add new args token_idx - - add the process of merging images into inputs_embeds - """ - token_idx = kwargs.get("token_idx", None) - if token_idx is None: - return super().prepare_inputs_for_generation( - input_ids=input_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - pixel_values=pixel_values, - image_sizes=image_sizes, - attention_mask=attention_mask, - **kwargs, - ) - else: - position_ids = kwargs.get("position_ids", None) - labels = kwargs.get("labels", None) - if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1: - vision_feature_select_strategy = kwargs.get("vision_feature_select_strategy", None) - vision_feature_layer = kwargs.get("vision_feature_layer", None) - vision_feature_select_strategy = ( - vision_feature_select_strategy - if vision_feature_select_strategy is not None - else self.config.vision_feature_select_strategy - ) - vision_feature_layer = ( - vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer - ) - - # 1. 
Extract the input embeddings - inputs_embeds = self.get_input_embeddings()(input_ids) - # 2. Merge text and images - batch_size, num_patches, num_channels, height, width = pixel_values.shape - reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width) - image_features = self.vision_tower(reshaped_pixel_values, output_hidden_states=True) - - selected_image_feature = image_features.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - - image_features = self.multi_modal_projector(selected_image_feature) - - # split up image_features for each of the individual images - # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) - # if we assume each image has 5 image features (base image + 4 patches) - split_sizes = [image.shape[0] for image in pixel_values] - image_features = torch.split(image_features, split_sizes, dim=0) - - # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size - - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - - if height * width != base_image_feature.shape[0]: - raise ValueError("The number of patches is not consistent with the image size.") - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_sizes[image_idx], - self.config.image_grid_pinpoints, - self.config.vision_config.image_size, - ) - image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) - image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image(image_feature, image_sizes[image_idx]) - image_feature = torch.cat( - ( - image_feature, - self.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1), - ), - dim=-1, - ) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat((base_image_feature, image_feature), dim=0) - else: - image_feature = image_feature[0] - image_feature = torch.cat((image_feature, self.image_newline[None]), dim=0) - new_image_features.append(image_feature) - image_features = torch.stack(new_image_features, dim=0) - inputs_embeds, attention_mask, labels, position_ids, self.text_tokens_pos = ( - self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - ) - self.image_offset = image_features.shape[1] - 1 # image_token has occupied 1 token position. 
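# Illustrative sketch (toy numbers only): why image_offset is image_features.shape[1] - 1.
# The single <image> token in the prompt is replaced by num_image_tokens feature rows,
# so token_idx in the merged sequence trails the text-only token_idx by
# num_image_tokens - 1, which is exactly the offset added when calling the language
# model (token_idx + self.image_offset).
prompt_len = 20                      # text prompt length, including one <image> token
num_image_tokens = 576               # assumed number of visual tokens for the image
image_offset = num_image_tokens - 1
merged_len = prompt_len + image_offset          # length seen by the language model
token_idx_text_view = prompt_len                # next position, counting text tokens only
token_idx_merged_view = token_idx_text_view + image_offset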
- if labels is None: - labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long) - - # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of - # generation with cache - elif past_key_values is not None and pixel_values is not None: - seq_len = input_ids.shape[1] - pad_len = seq_len - token_idx - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = extended_attention_mask - attention_mask[:, -pad_len:] = 0 - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "token_idx": token_idx, - "image_sizes": image_sizes, - "labels": labels, - } - ) - - return model_inputs diff --git a/optimum/habana/transformers/models/mistral/__init__.py b/optimum/habana/transformers/models/mistral/__init__.py deleted file mode 100644 index a4cdf47bff..0000000000 --- a/optimum/habana/transformers/models/mistral/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .configuration_mistral import MistralConfig -from .modeling_mistral import ( - GaudiMistralAttention, - GaudiMistralDecoderLayer, - GaudiMistralForCausalLM, - GaudiMistralModel, - gaudi_mistral_rmsnorm_forward, -) diff --git a/optimum/habana/transformers/models/mistral/configuration_mistral.py b/optimum/habana/transformers/models/mistral/configuration_mistral.py deleted file mode 100644 index 6e5968a00f..0000000000 --- a/optimum/habana/transformers/models/mistral/configuration_mistral.py +++ /dev/null @@ -1,78 +0,0 @@ -from transformers.models.mistral.configuration_mistral import MistralConfig - - -class 
MistralConfig(MistralConfig): - """ - Copied from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mistral/configuration_mistral.py#L29 - Changes: - - add `rope_scaling` and `_rope_scaling_validation` (inspired from Llama) - """ - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - rope_scaling=None, - **kwargs, - ): - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - pad_token_id, - bos_token_id, - eos_token_id, - tie_word_embeddings, - rope_theta, - sliding_window, - attention_dropout, - **kwargs, - ) - - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - def _rope_scaling_validation(self): - """ - Taken from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/configuration_llama.py#L172 - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/optimum/habana/transformers/models/mistral/modeling_mistral.py b/optimum/habana/transformers/models/mistral/modeling_mistral.py deleted file mode 100644 index 29cd5db92a..0000000000 --- a/optimum/habana/transformers/models/mistral/modeling_mistral.py +++ /dev/null @@ -1,901 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
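# Illustrative `rope_scaling` values accepted by the validation in the configuration above (example
# values only, not defaults):
#   {"type": "linear", "factor": 2.0}   # linear position interpolation
#   {"type": "dynamic", "factor": 4.0}  # dynamic NTK-aware scaling
# Any other shape (missing or extra keys, unknown type, factor not a float > 1.0) raises a ValueError.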
-"""PyTorch Mistral model.""" - -import math -import os -from typing import List, Optional, Tuple, Union - -import habana_frameworks.torch.core as htcore -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.mistral.configuration_mistral import MistralConfig -from transformers.models.mistral.modeling_mistral import ( - MistralAttention, - MistralDecoderLayer, - MistralForCausalLM, - MistralMLP, - MistralModel, - MistralRMSNorm, - apply_rotary_pos_emb, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) -from ..llama.modeling_llama import ( - GaudiLlamaDynamicNTKScalingRotaryEmbedding, - GaudiLlamaLinearScalingRotaryEmbedding, - GaudiLlamaRotaryEmbedding, -) - - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE - - has_fused_rope = True -except ImportError: - has_fused_rope = False - print("Not using HPU fused kernel for apply_rotary_pos_emb") - -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm -except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None - -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None - -logger = logging.get_logger(__name__) - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - def update(self, prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - return prev - else: - return torch.cat((prev, cur), dim=dim) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - -class ModuleFusedSDPA(torch.nn.Module): - def __init__(self, fusedSDPA): - super().__init__() - self._hpu_kernel_fsdpa = fusedSDPA - - def forward(self, query, key, value, attn_mask, dropout_p, is_casual, scale): - return self._hpu_kernel_fsdpa.apply(query, key, value, attn_mask, dropout_p, is_casual, scale) - - -class Matmul(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -def gaudi_mistral_repeat_kv( - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, -): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - - -def gaudi_mistral_rmsnorm_forward(self, hidden_states): - """ - Copied from MistralRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - override RMSNorm with Habana fused RMSNorm - """ - if hidden_states.device.type == "hpu" and FusedRMSNorm: - # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype - if hidden_states.dtype != self.weight.dtype: - orig_dtype = hidden_states.dtype - hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) - return hidden_states.to(orig_dtype) - else: - hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) - return hidden_states - else: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class GaudiMistralAttention(MistralAttention): - def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): - 
super().__init__(config, layer_idx) - config.rope_scaling = config.rope_scaling if hasattr(config, "rope_scaling") else None - self.config = config - self.k_cache = KVCache() - self.v_cache = KVCache() - self.matmul_qk = Matmul() - self.matmul_av = Matmul() - self.fused_scaled_dot_product_attention = ModuleFusedSDPA(FusedSDPA) if FusedSDPA else None - self.inp_seq_len = -1 - self._init_rope() - self.norm_factor = 1.0 / math.sqrt(self.head_dim) - - def _init_rope(self): - """ - Copied from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L294 - """ - if self.config.rope_scaling is None: - self.rotary_emb = GaudiLlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = GaudiLlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = GaudiLlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) - device = self.k_proj.weight.device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings - # This helps in avoiding creation of these caches during actual model forward pass and - # reduce memory consumption and improve performance. 
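# The dummy rotary_emb call below is made only for its side effect: the rotary embedding rebuilds its
# cached cos/sin tables when it sees a seq_len larger than what it has cached (the k_proj weight is
# presumably passed just to supply the dtype and device); the returned cos/sin values are discarded.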
- if seq_len > self.max_position_embeddings: - self.max_position_embeddings = seq_len - _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) - - def reorder(self, tensor, beam_idx, dim_a, dim_b): - updated = tensor.index_select(0, beam_idx) - tensor.copy_(updated) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - if self.k_cache.cache is None: - return (None, None) - - head_dim = self.k_cache.cache.size(-1) - seq_length = self.k_cache.cache.size(-2) - self.reorder(self.k_cache.cache, beam_idx, seq_length, head_dim) - self.reorder(self.v_cache.cache, beam_idx, seq_length, head_dim) - return (self.k_cache.cache.shape, self.v_cache.cache.shape) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - attn_softmax_bf16: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from MistralAttention.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - add new args token_idx - - add new args reuse_cache - - add new args cache_idx - """ - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_shape = ( - (past_key_value[0][-2] if reuse_cache else past_key_value[0].shape[-2]) - if isinstance(past_key_value, tuple) - else past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - ) - if token_idx is not None: - kv_seq_len = kv_shape - else: - kv_seq_len += kv_shape - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - - if use_cache: - # reuse k, v, self_attention - if reuse_cache: - key_states = self.k_cache(key_states, 2, token_idx) - value_states = self.v_cache(value_states, 2, token_idx) - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if past_key_value is None: - past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) - past_value = torch.zeros( - key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device - ) - past_key_value = (past_key, past_value) - key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) - if token_idx is None: - past_key_value = (key_states, value_states) - - if cache_idx is not None and q_len == 1: - key_states = key_states[:, :, :cache_idx, :] - value_states = value_states[:, :, :cache_idx, :] - if attention_mask is not None: - attention_mask = attention_mask[:, :, :, :cache_idx] - kv_seq_len = key_states.shape[-2] - else: - past_key_value = None - - import habana_frameworks.torch.hpu as ht - - if FusedSDPA and use_flash_attention: - if q_len == 1: - # next token - use_recompute = True if os.getenv("QUANT_CONFIG", "") else False - with ht.sdp_kernel(enable_recompute=use_recompute): - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, 0.0, False, None - ) - else: - # first token - if flash_attention_causal_mask: - # causal masking on first token requires inputs to be of the same length - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, None, 0.0, True, None - ) - else: - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - attn_output = self.fused_scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, 0.0, False, None - ) - - else: - # repeat k/v heads if n_kv_heads < n_heads - query_states, key_states, value_states, attention_mask = gaudi_mistral_repeat_kv( - query_states, key_states, value_states, attention_mask, self.num_key_value_groups - ) - attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) * self.norm_factor - - if attn_weights.size() not in [ - (bsz, self.num_heads, q_len, kv_seq_len), - (bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len), - ]: - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)} or" - f" {(bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() not in [(bsz, 1, q_len, kv_seq_len), (bsz, 1, 1, q_len, kv_seq_len)]: - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or {(bsz, 1, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - if 
attn_softmax_bf16: - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) - else: - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = self.matmul_av(attn_weights, value_states) - attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class GaudiMistralDecoderLayer(MistralDecoderLayer): - def __init__(self, config: MistralConfig, layer_idx: int): - super(MistralDecoderLayer, self).__init__() - self.hidden_size = config.hidden_size - - self.self_attn = GaudiMistralAttention(config, layer_idx) - - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.self_attn.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.self_attn.update_sincos_cache(seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - attn_softmax_bf16: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from MistralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - attn_softmax_bf16=attn_softmax_bf16, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = 
self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -class GaudiMistralModel(MistralModel): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) - - def update_sincos_cache(self, seq_len): - for layer in self.layers: - layer.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - attn_softmax_bf16: Optional[bool] = False, - lazy_mode: Optional[bool] = True, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - ) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from MistralModel.forward: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - add new args token_idx - - add new arg lazy_mode - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - use_legacy_cache = True - use_new_cache = False - if past_key_values is not None and use_cache and not reuse_cache and use_new_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if not use_new_cache else None - - if lazy_mode: - htcore.mark_step() - - for layer_idx, decoder_layer in enumerate(self.layers): - if layer_idx == len(self.layers) // 2 or ( - lazy_mode - and not self.training - and (torch.distributed.is_initialized() is False or torch.distributed.get_world_size() == 1) - ): - htcore.mark_step() - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - None if past_key_values is None else past_key_values[layer_idx], - output_attentions, - use_cache, - None, - False, - cache_idx, - attn_softmax_bf16, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=None if past_key_values is None else past_key_values[layer_idx], - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - attn_softmax_bf16=attn_softmax_bf16, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache - if not use_new_cache - else (next_decoder_cache.to_legacy_cache() if 
use_legacy_cache else next_decoder_cache) - ) - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiMistralForCausalLM(MistralForCausalLM): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.model.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.model.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - trim_logits: Optional[bool] = False, - cache_idx: Optional[int] = None, - attn_softmax_bf16: Optional[bool] = False, - lazy_mode: Optional[bool] = True, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from MistralForCausalLM: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - add new args token_idx - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.generation_config.use_fused_rope is False: - global has_fused_rope - has_fused_rope = False - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - attn_softmax_bf16=attn_softmax_bf16, - lazy_mode=lazy_mode, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - ) - hidden_states = outputs[0] - _, seq_len, _ = hidden_states.shape - if seq_len > 1 and trim_logits and not self.training: - if token_idx is not None: - hidden_states = hidden_states.index_select(1, token_idx - 1) - else: - hidden_states = hidden_states[:, -1, :] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - 
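# The view calls below flatten the (batch, seq_len - 1, vocab_size) logits into (batch * (seq_len - 1), vocab_size)
# and the (batch, seq_len - 1) labels into (batch * (seq_len - 1),), so the loss sees one next-token prediction per row.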
shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - """ - Inherits from MistralForCausalLM: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/mistral/modeling_mistral.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step 2, when the KV cache is enabled, slice next_input_ids from input_ids based on the token_idx - - from step 2, when the KV cache is enabled, slice next_position_ids from position_ids based on the token_idx - """ - reuse_cache = kwargs.get("reuse_cache", False) - token_idx = kwargs.get("token_idx", None) - - # Omit tokens covered by past_key_values - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
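# (For example, with a bounded cache of length max_cache_length that is already full, only the most
# recent max_cache_length attention-mask columns are kept once a new token is appended.)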
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - elif reuse_cache and token_idx is not None: - # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "reuse_cache": kwargs.get("reuse_cache"), - "trim_logits": kwargs.get("trim_logits"), - "cache_idx": kwargs.get("cache_idx"), - "attn_softmax_bf16": kwargs.get("attn_softmax_bf16"), - "lazy_mode": kwargs.get("lazy_mode"), - "use_flash_attention": kwargs.get("use_flash_attention"), - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), - "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), - } - ) - return model_inputs - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and has_fused_rope: - # TODO: remove `.clone()` when SynapseAI v1.15 is released - if k.dtype == torch.bfloat16: - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, - cos.unsqueeze(0).unsqueeze(0).clone().to(torch.bfloat16), - sin.unsqueeze(0).unsqueeze(0).clone().to(torch.bfloat16), - position_ids, - ) - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ) - else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) diff --git a/optimum/habana/transformers/models/mixtral/__init__.py b/optimum/habana/transformers/models/mixtral/__init__.py deleted file mode 100644 index ab34977c37..0000000000 --- a/optimum/habana/transformers/models/mixtral/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .configuration_mixtral import MixtralConfig -from .modeling_mixtral import ( - GaudiMixtralAttention, - GaudiMixtralDecoderLayer, - GaudiMixtralForCausalLM, - GaudiMixtralModel, - gaudi_mixtral_block_sparse_moe_forward, - gaudi_mixtral_rmsnorm_forward, -) diff --git a/optimum/habana/transformers/models/mixtral/configuration_mixtral.py b/optimum/habana/transformers/models/mixtral/configuration_mixtral.py deleted file mode 100644 index 90a783f0d1..0000000000 --- a/optimum/habana/transformers/models/mixtral/configuration_mixtral.py +++ /dev/null @@ -1,88 +0,0 @@ -from 
transformers.models.mixtral.configuration_mixtral import MixtralConfig - - -class MixtralConfig(MixtralConfig): - """ - Copied from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/mixtral/configuration_mixtral.py#L28 - Changes: - - add `rope_scaling` and `_rope_scaling_validation` (inspired from Llama) - """ - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=8, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, - rope_scaling=None, - **kwargs, - ): - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - pad_token_id, - bos_token_id, - eos_token_id, - tie_word_embeddings, - rope_theta, - sliding_window, - attention_dropout, - num_experts_per_tok, - num_local_experts, - output_router_logits, - router_aux_loss_coef, - router_jitter_noise, - **kwargs, - ) - - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - def _rope_scaling_validation(self): - """ - Taken from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/configuration_llama.py#L172 - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py deleted file mode 100644 index 883409a28f..0000000000 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ /dev/null @@ -1,917 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PyTorch Mixtral model.""" - -import contextlib -import math -import warnings -from typing import List, Optional, Tuple, Union - -import habana_frameworks.torch.core as htcore -import torch -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.integrations.deepspeed import is_deepspeed_available -from transformers.modeling_attn_mask_utils import ( - _prepare_4d_causal_attention_mask, - _prepare_4d_causal_attention_mask_for_sdpa, -) -from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from transformers.models.mixtral.modeling_mixtral import ( - MixtralAttention, - MixtralDecoderLayer, - MixtralForCausalLM, - MixtralModel, - apply_rotary_pos_emb, - load_balancing_loss_func, -) -from transformers.utils import logging - -from ..llama.modeling_llama import ( - GaudiLlamaDynamicNTKScalingRotaryEmbedding, - GaudiLlamaLinearScalingRotaryEmbedding, - GaudiLlamaRotaryEmbedding, -) -from .configuration_mixtral import MixtralConfig - - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE -except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None - -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm -except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None - -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None - -try: - from habana_frameworks.torch.hpu import sdp_kernel - - SDPContext = True -except ImportError: - SDPContext = False - -logger = logging.get_logger(__name__) - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and FusedRoPE: - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids - ), FusedRoPE.apply(k, cos.unsqueeze(0).unsqueeze(0), sin.unsqueeze(0).unsqueeze(0), position_ids) - else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) - - -def gaudi_mixtral_rmsnorm_forward(self, hidden_states): - """ - Copied from MixtralRMSNorm.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py - The only differences are: - - override RMSNorm with Habana fused RMSNorm - """ - if hidden_states.device.type == "hpu" and FusedRMSNorm: - # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype - if hidden_states.dtype != self.weight.dtype: - orig_dtype = hidden_states.dtype - hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) - return hidden_states.to(orig_dtype) - else: - hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) - return hidden_states - else: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, 
keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def gaudi_mixtral_repeat_kv( - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, -): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - def update(self, prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - return prev - else: - return torch.cat((prev, cur), dim=dim) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - -class GaudiMixtralAttentionLongSequence: - @staticmethod - def forward(q, k, v, mask, causal, q_block_size): - """ - Support long sequence at prompt phase - """ - q_len = q.size(-2) - q_tiles = (q_len // q_block_size) if (q_len % q_block_size == 0) else math.ceil(q_len / q_block_size) - q_padding = q_tiles * q_block_size - q_len - q = F.pad(q, (0, 0, 0, q_padding), "constant", 0) - if mask is not None: - mask = F.pad(mask, (0, 0, 0, q_padding), "constant", -10000.0) - attn_output = torch.zeros_like(q) - - for i in range(q_tiles): - s, e = i * q_block_size, (i + 1) * q_block_size - row_q = q[:, :, s:e, :] - row_mask = mask[:, :, s:e, :] - row_o = attn_output[:, :, s:e, :] - row_o.fill_(FusedSDPA.apply(row_q, k, v, row_mask, 0.0, causal, None)) - - if q_padding != 0: - attn_output = attn_output[:, :, :-q_padding, :] - - return attn_output - - -class GaudiMixtralAttention(MixtralAttention): - def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx) - config.rope_scaling = config.rope_scaling if hasattr(config, "rope_scaling") else None - self.config = config - self._init_rope() - self.k_cache = KVCache() - self.v_cache = KVCache() - self.inp_seq_len = -1 - self.norm_factor = 1.0 / math.sqrt(self.head_dim) - self.block_size = 1024 - - def _init_rope(self): - """ - Copied from: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L294 - """ - if self.config.rope_scaling is None: - self.rotary_emb = GaudiLlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = GaudiLlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = GaudiLlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) - device = self.k_proj.weight.device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - cache_idx: int = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from 
MixtralAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py - The only differences are: - - add new args token_idx - - optimize KV cache - - add new args reuse_cache - - add new args flash_attention_recompute - - add new args cache_idx - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - if token_idx is None: - if hasattr(past_key_value, "get_usable_length"): - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value[0].shape[-2] - else: - if reuse_cache: - kv_seq_len = past_key_value[0][-2] - else: - kv_seq_len = past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - - if use_cache: - if reuse_cache: - key_states = self.k_cache(key_states, 2, token_idx) - value_states = self.v_cache(value_states, 2, token_idx) - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if past_key_value is None: - past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) - past_value = torch.zeros( - key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device - ) - past_key_value = (past_key, past_value) - key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = self.k_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) - if token_idx is None: - past_key_value = (key_states, value_states) - - if cache_idx is not None and q_len == 1: - key_states = key_states[:, :, :cache_idx, :] - value_states = value_states[:, :, :cache_idx, :] - if attention_mask is not None: - attention_mask = attention_mask[:, :, :, :cache_idx] - kv_seq_len = key_states.shape[-2] - else: - past_key_value = None - - if FusedSDPA: - if query_states.dtype != key_states.dtype: - key_states = key_states.type(query_states.dtype) - value_states = value_states.type(query_states.dtype) - # support long sequences exceeding 8192 - if not self.training and q_len == key_states.size(-2) and q_len > 8192: - htcore.mark_step() - attn_output = GaudiMixtralAttentionLongSequence.forward( - query_states, - key_states, - value_states, - attention_mask, - False, - self.block_size, - ) - htcore.mark_step() - else: - with sdp_kernel( - enable_recompute=flash_attention_recompute - ) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( - query_states, key_states, value_states, 
attention_mask, 0.0, False, None - ) - else: - query_states, key_states, value_states, attention_mask = gaudi_mixtral_repeat_kv( - query_states, key_states, value_states, attention_mask, self.num_key_value_groups - ) - - attn_weights = torch.matmul(query_states, key_states.transpose(-2, -1)) * self.norm_factor - - if attention_mask is not None: - attention_mask = attention_mask.unsqueeze(2) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - attn_output = attn_output.reshape(bsz, self.num_heads, q_len, self.head_dim).contiguous() - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def gaudi_mixtral_block_sparse_moe_forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Copied from MixtralSparseMoeBlock.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py - The only differences are: - - optimize expert forward, remove dynamic control and dynamic shape - """ - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (batch * sequence_length, n_experts) - router_logits = self.gate(hidden_states) - - if is_deepspeed_available() and (not self.training): - from deepspeed import comm as dist - - if dist.is_initialized(): - output_tensors = [router_logits.clone() for _ in range(dist.get_world_size())] - dist.all_gather(output_tensors, router_logits) - router_logits = torch.cat(output_tensors, dim=1) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - # we cast back to the input dtype - routing_weights = routing_weights.to(hidden_states.dtype) - - final_hidden_states = torch.zeros( - (batch_size, sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device - ) - - padded_weights = torch.zeros( - (batch_size * sequence_length, self.num_experts), dtype=hidden_states.dtype, device=hidden_states.device - ) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, sequence_length, self.num_experts) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - # Loop over all available experts in the model and perform the computation on each expert - for expert_idx in range(self.num_experts): - expert_layer = self.experts[expert_idx] - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, hidden_dim) - current_hidden_states_static = ( - expert_layer(current_state_static).reshape(-1, sequence_length, hidden_dim) * padded_weight - ) - final_hidden_states += current_hidden_states_static - # support long sequences exceeding 8192 - if not self.training and sequence_length > 8192: - htcore.mark_step() - - return final_hidden_states, router_logits - - -class GaudiMixtralDecoderLayer(MixtralDecoderLayer): - def __init__(self, config: 
MixtralConfig, layer_idx: int): - super().__init__(config, layer_idx) - self.self_attn = GaudiMixtralAttention(config, layer_idx) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - output_router_logits: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - lazy_mode: Optional[bool] = True, - reuse_cache: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - cache_idx: int = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from MixtralDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py - The only differences are: - - add new args token_idx - - add new args lazy_mode - - add new args reuse_cache - - add new args flash_attention_recompute - - add new args cache_idx - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - if lazy_mode: - htcore.mark_step() - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - reuse_cache=reuse_cache, - flash_attention_recompute=flash_attention_recompute, - cache_idx=cache_idx, - ) - hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states, router_logits = self.block_sparse_moe(hidden_states) - hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - if output_router_logits: - outputs += (router_logits,) - - return outputs - - -class GaudiMixtralModel(MixtralModel): - def __init__(self, config: MixtralConfig): - super().__init__(config) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_router_logits: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - lazy_mode: Optional[bool] = True, - reuse_cache: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - cache_idx: int = None, - ) -> Union[Tuple, MoeModelOutputWithPast]: - """ - Copied from 
MixtralModel.forward: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1069 - The only differences are: - - add new args token_idx - - add new args reuse_cache - - add new args flash_attention_recompute - - add new args cache_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_router_logits = ( - output_router_logits if output_router_logits is not None else self.config.output_router_logits - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - past_key_values_length = 0 - use_new_cache = False # Ignoring new Cache path for HPU - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if past_key_values is not None and use_cache: - if reuse_cache: - past_key_values_length = past_key_values[0][0][2] - else: - if use_new_cache: - if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length() - else: - past_key_values_length = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self.config._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self.config._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self.config._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
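# For intuition: the 4D additive mask built just below has shape
# (batch_size, 1, seq_length, past_key_values_length + seq_length), with 0.0 where a
# query may attend and torch.finfo(dtype).min where it may not. A rough sketch of the
# purely causal case (assumed shapes, not the helper's exact internals) is
#   torch.triu(torch.full((1, 1, q, past + q), torch.finfo(dtype).min), diagonal=past + 1)
# so adding it to the attention scores pushes future positions to negligible
# probability after the softmax.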
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_router_logits = () if output_router_logits else None - next_decoder_cache = () if not use_new_cache else None - - for layer_idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - output_router_logits, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=None if past_key_values is None else past_key_values[layer_idx], - output_attentions=output_attentions, - output_router_logits=output_router_logits, - use_cache=use_cache, - token_idx=token_idx, - lazy_mode=lazy_mode, - reuse_cache=reuse_cache, - flash_attention_recompute=flash_attention_recompute, - cache_idx=cache_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if output_router_logits: - all_router_logits += (layer_outputs[-1],) - - htcore.mark_step() - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache - ) - - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] - if v is not None - ) - return MoeModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - router_logits=all_router_logits, - ) - - -class GaudiMixtralForCausalLM(MixtralForCausalLM): - """ - Inherits from MixtralForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1231 - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - self.kv_cache_len = max_seq_len - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - 
output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_router_logits: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - lazy_mode: Optional[bool] = True, - reuse_cache: Optional[bool] = None, - flash_attention_recompute: Optional[bool] = False, - cache_idx: int = None, - ) -> Union[Tuple, MoeCausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_router_logits = ( - output_router_logits if output_router_logits is not None else self.config.output_router_logits - ) - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_router_logits=output_router_logits, - return_dict=return_dict, - token_idx=token_idx, - lazy_mode=lazy_mode, - reuse_cache=reuse_cache, - flash_attention_recompute=flash_attention_recompute, - cache_idx=cache_idx, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - aux_loss = None - if output_router_logits: - aux_loss = load_balancing_loss_func( - outputs.router_logits if return_dict else outputs[-1], - self.num_experts, - self.num_experts_per_tok, - attention_mask, - ) - if labels is not None: - loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device - - if not return_dict: - output = (logits,) + outputs[1:] - if output_router_logits: - output = (aux_loss,) + output - return (loss,) + output if loss is not None else output - - return MoeCausalLMOutputWithPast( - loss=loss, - aux_loss=aux_loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - router_logits=outputs.router_logits, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - past_length = 0 - reuse_cache = kwargs.get("reuse_cache") - token_idx = kwargs.get("token_idx", None) - - # Omit tokens covered by past_key_values - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of 
the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - elif reuse_cache and token_idx is not None: - # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "lazy_mode": kwargs.get("lazy_mode"), - "reuse_cache": reuse_cache, - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), - "cache_idx": kwargs.get("cache_idx"), - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/modeling_all_models.py b/optimum/habana/transformers/models/modeling_all_models.py deleted file mode 100644 index 808e0f012a..0000000000 --- a/optimum/habana/transformers/models/modeling_all_models.py +++ /dev/null @@ -1,168 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
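The Gaudi Mixtral port above (and the MPT and OPT ports later in this diff) all share the same static-shape decoding pattern: the KV cache is pre-allocated up to the maximum sequence length, each generated token is written in place at position `token_idx - 1` with `index_copy_`, and `prepare_inputs_for_generation` feeds the model only the current token via `index_select` instead of the growing sequence. A minimal standalone sketch of that pattern, using made-up tensor sizes rather than code from any of these files:

```python
import torch

# Hypothetical sizes, for illustration only.
batch, heads, max_seq, head_dim = 2, 4, 16, 8
prompt_len = 5

# KV cache pre-allocated to the maximum sequence length: its shape never changes
# during generation, which keeps device graph shapes static.
k_cache = torch.zeros(batch, heads, max_seq, head_dim)

# Prompt phase: write all prompt keys in one shot.
prompt_keys = torch.randn(batch, heads, prompt_len, head_dim)
k_cache[:, :, :prompt_len].copy_(prompt_keys)

# Decode phase: token_idx is a one-element tensor holding the 1-based position of
# the token currently being generated; the new key is written in place.
token_idx = torch.tensor([prompt_len + 1])
new_key = torch.randn(batch, heads, 1, head_dim)
k_cache.index_copy_(2, token_idx - 1, new_key)

# Input side: only the current token is selected from input_ids, mirroring the
# torch.index_select(input_ids, 1, token_idx - 1) call in prepare_inputs_for_generation.
input_ids = torch.randint(0, 100, (batch, prompt_len + 1))
current_token = torch.index_select(input_ids, 1, token_idx - 1)  # shape: (batch, 1)
```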
- -import warnings -from typing import Tuple - -import torch -from transformers.modeling_utils import ModuleUtilsMixin, PretrainedConfig -from transformers.utils.import_utils import is_torch_sdpa_available - - -def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> torch.Tensor: - """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L640 - except that mixed precision is disabled for computing: - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min - """ - if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] - if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition - # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow - # /transformer/transformer_layers.py#L270 - # encoder_extended_attention_mask = (encoder_extended_attention_mask == - # encoder_extended_attention_mask.transpose(-1, -2)) - # torch.finfo must take the dtype of encoder_extended_attention_mask - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # bf16 compatibility - encoder_extended_attention_mask = 1.0 - encoder_extended_attention_mask - # Fixes issue where the model is not in bf16 and mul is casting it to values out of range resulting in nan - with torch.autocast(enabled=False, device_type="hpu"): - encoder_extended_attention_mask = ( - encoder_extended_attention_mask * torch.finfo(encoder_extended_attention_mask.dtype).min - ) - - return encoder_extended_attention_mask - - -def gaudi_get_extended_attention_mask( - self, attention_mask: torch.Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None -) -> torch.Tensor: - """ - Same as https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_utils.py#L692 - except that mixed precision is disabled for computing: - extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min - """ - if dtype is None: - dtype = self.dtype - - if not (attention_mask.dim() == 2 and self.config.is_decoder): - # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` - if device is not None: - warnings.warn( - "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning - ) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
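# (For example, a [batch_size, from_seq_length, to_seq_length] mask becomes
# [batch_size, 1, from_seq_length, to_seq_length] via attention_mask[:, None, :, :]
# in the branch below, so it broadcasts across the num_heads dimension.)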
- if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: - extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( - input_shape, attention_mask, device - ) - else: - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" - ) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - # torch.finfo must take the dtype of encoder_extended_attention_mask - extended_attention_mask = extended_attention_mask.to(dtype=dtype) # bf16 compatibility - extended_attention_mask = 1.0 - extended_attention_mask - with torch.autocast(enabled=False, device_type="hpu"): - extended_attention_mask = extended_attention_mask * torch.finfo(extended_attention_mask.dtype).min - - return extended_attention_mask - - -def gaudi_conv1d_forward(self, x): - """ - Same as https://github.com/huggingface/transformers/blob/3335724376319a0c453049d0cd883504f530ff52/src/transformers/pytorch_utils.py#L100 - but moves reshape before view for tpc auto fusion. - """ - size_out = x.size()[:-1] + (self.nf,) - x = torch.mm(x.view(-1, x.size(-1)), self.weight) - x = x.view(size_out) - x = x + self.bias - return x - - -# Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa -@classmethod -def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig: - # This model doesn't support SDPA in Gaudi yet, fallback to original code. - MODELS_ATTN_IMPLEMENTATION_EAGER = ["bart", "gpt_bigcode", "mistral", "mixtral"] - - if config.model_type in MODELS_ATTN_IMPLEMENTATION_EAGER: - config._attn_implementation = "eager" - return config - - # Otherwise, fallback to original implementation - # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_utils.py#L1542 - if hard_check_only: - if not cls._supports_sdpa: - raise ValueError( - f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet." - " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe" - ' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' - ) - if not is_torch_sdpa_available(): - raise ImportError("PyTorch SDPA requirements in Transformers are not met. 
Please install torch>=2.1.1.") - - if not is_torch_sdpa_available() or not cls._supports_sdpa: - return config - - _is_bettertransformer = getattr(cls, "use_bettertransformer", False) - if _is_bettertransformer: - return config - - if not hard_check_only: - config._attn_implementation = "sdpa" - - return config - - -# Splitting DeepSpeed LinearAllReduce to three parts to avoid redundant memory consumption -class ScopedLinearAllReduce(torch.nn.Module): - def __init__(self, mod, *args, **kwargs): - self.__dict__.update(mod.__dict__) - - def forward(self, input): - # pre_all_reduce - - output = torch.matmul(input, self.weight.transpose(-1, -2)) - return output - - def all_reduce(self, input): - if self.mp_group is not None: - from deepspeed import comm as dist - - dist.inference_all_reduce(input, group=self.mp_group) - - def post_all_reduce(self, input): - output = input + self.bias if (self.bias is not None) else input - return output diff --git a/optimum/habana/transformers/models/mpt/__init__.py b/optimum/habana/transformers/models/mpt/__init__.py deleted file mode 100644 index 1ab41c1a80..0000000000 --- a/optimum/habana/transformers/models/mpt/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_mpt import ( - GaudiMptForCausalLM, - GaudiMptModel, - gaudi_mpt_attention_forward, - gaudi_mpt_block_forward, -) diff --git a/optimum/habana/transformers/models/mpt/modeling_mpt.py b/optimum/habana/transformers/models/mpt/modeling_mpt.py deleted file mode 100644 index ed470f165a..0000000000 --- a/optimum/habana/transformers/models/mpt/modeling_mpt.py +++ /dev/null @@ -1,384 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### -# Copyright (C) 2022-2023 Habana Labs, Ltd. 
an Intel Company -############################################################################### -from typing import Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions -from transformers.models.mpt.modeling_mpt import MptForCausalLM, MptModel -from transformers.utils import logging - -from ...modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask - - -logger = logging.get_logger(__name__) - - -def gaudi_mpt_attention_forward( - self, - hidden_states: torch.Tensor, - position_bias: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - token_idx: Optional[torch.Tensor] = None, -): - """ - Copied from MptAttention.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - - batch_size, seq_length = hidden_states.shape[:2] - - mixed_qkv = self.Wqkv(hidden_states) - bs, seq_len, three_times_hidden_size = mixed_qkv.shape - mixed_qkv = mixed_qkv.view(bs, seq_len, self.n_heads * 3, self.head_dim) - mixed_qkv = mixed_qkv.transpose(1, 2) - query_states, key_states, value_states = ( - mixed_qkv[:, : self.n_heads, ...], - mixed_qkv[:, self.n_heads : 2 * self.n_heads, ...], - mixed_qkv[:, 2 * self.n_heads :, ...], - ) - - if past_key_value is not None: - if len(past_key_value) != 0: - if token_idx is not None: - past_key_value[0].index_copy_(2, token_idx - 1, key_states) - past_key_value[1].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value[0] - value_states = past_key_value[1] - else: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - past_key_value = (key_states, value_states) - else: - past_key_value = (key_states, value_states) - - attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale - - query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2] - - if position_bias is not None: - if len(position_bias.shape) != 3: - raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}") - key_length = key_states.shape[-2] - - position_bias_query_index = max(0, position_bias.size(1) - query_length) - position_bias_key_index = max(0, position_bias.size(2) - key_length) - - position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:] - - attention_scores = attention_scores + position_bias - - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min) - - # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training) - - context_states = torch.matmul(attn_weights, value_states) - context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) - attn_output = self.out_proj(context_states) - - return attn_output, attn_weights, past_key_value - - -def gaudi_mpt_block_forward( - self, - hidden_states: torch.Tensor, - position_bias: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: 
Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -): - """ - Copied from MptBlock.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - """ - # hidden_states: [batch_size, seq_length, hidden_size] - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.norm_1(hidden_states) - - residual = hidden_states - - # Self attention. - attn_outputs, attn_weights, past_key_value = self.attn( - layernorm_output, - position_bias=position_bias, - attention_mask=attention_mask, - past_key_value=layer_past, - token_idx=token_idx, - ) - - hidden_states = self.resid_attn_dropout(attn_outputs) + residual - - layernorm_output = self.norm_2(hidden_states) - - # Get residual - residual = hidden_states - - # MLP. - output = self.ffn(layernorm_output, residual) - outputs = (output,) - - if use_cache: - outputs += (past_key_value,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs # hidden_states, present, attentions - - -class GaudiMptModel(MptModel): - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from MptModel.forward: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if past_key_values is None: - past_key_values = tuple([None] * len(self.blocks)) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - - hidden_states = inputs_embeds - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # Compute alibi tensor: check build_alibi_tensor documentation - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None and token_idx is None: # because RW-cache, not standard format - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) - else: - attention_mask = attention_mask.to(hidden_states.device) - - alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device) - - causal_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - causal_mask = causal_mask.bool() - - for block, layer_past in zip(self.blocks, past_key_values): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - outputs = self._gradient_checkpointing_func( - block.__call__, - hidden_states, - alibi, - causal_mask, - layer_past, - use_cache, - output_attentions, - None, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=causal_mask, - use_cache=use_cache, - output_attentions=output_attentions, - position_bias=alibi, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - # Add last hidden state - hidden_states = self.norm_f(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class GaudiMptForCausalLM(MptForCausalLM): - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - **kwargs, - ) -> dict: - """ - Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - """ - # only last tokens for input_ids if past is not None - if past_key_values is not None: - if token_idx is None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": 
inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, # NITS should it be layer_past? - "use_cache": use_cache, - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: - """ - Inherits from MptForCausalLM: https://github.com/huggingface/transformers/blob/v4.32.0/src/transformers/models/mpt/modeling_mpt.py - The only differences are: - - add new args token_idx - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_size, seq_length, vocab_size = shift_logits.shape - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/optimum/habana/transformers/models/opt/__init__.py b/optimum/habana/transformers/models/opt/__init__.py deleted file mode 100644 index 9ea5a435ee..0000000000 --- a/optimum/habana/transformers/models/opt/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .modeling_opt import ( - GaudiOPTForCausalLM, - GaudiOPTLearnedPositionalEmbedding, - gaudi_opt_attention_forward, - gaudi_opt_decoder_forward, - gaudi_opt_decoder_layer_forward, - gaudi_opt_model_forward, -) diff --git a/optimum/habana/transformers/models/opt/modeling_opt.py b/optimum/habana/transformers/models/opt/modeling_opt.py deleted file mode 100644 index 9f113453e9..0000000000 --- a/optimum/habana/transformers/models/opt/modeling_opt.py +++ /dev/null @@ -1,537 +0,0 @@ -from typing import List, Optional, Tuple, Union - -import torch -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTLearnedPositionalEmbedding, logger - -from ...modeling_attn_mask_utils import 
_gaudi_prepare_4d_causal_attention_mask - - -class GaudiOPTLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding): - """ - Inherits from OPTLearnedPositionalEmbedding: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - - compute embedding using token_idx if past_key_values_length not 0 - """ - - def forward( - self, - attention_mask: torch.LongTensor, - past_key_values_length: int = 0, - token_idx: Optional[torch.Tensor] = None, - ): - attention_mask = attention_mask.long() - - if past_key_values_length == 0: - # first step or kv cache disabled - positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 - positions = positions[:, past_key_values_length:] - return torch.nn.Embedding.forward(self, positions + self.offset) - else: - # if not 0, kv cache is enabled and from step = 2, past_key_values_length is equal to the final length of outputs - return torch.nn.Embedding.forward(self, token_idx + self.offset) - - -def gaudi_opt_attention_forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from OPTAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - if token_idx is not None: - past_key_value[0].index_copy_(2, token_idx - 1, key_states) - past_key_value[1].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value[0] - value_states = past_key_value[1] - else: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) - ) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = torch.nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned aross GPUs when using tensor-parallelism. 
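# (At this point attn_output has shape (bsz, tgt_len, num_heads, head_dim); the
# reshape below flattens the last two dimensions back to embed_dim = num_heads * head_dim.)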
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -def gaudi_opt_decoder_layer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from OPTDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = torch.nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Fully Connected - hidden_states_shape = hidden_states.shape - hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - - hidden_states = self.fc2(hidden_states) - hidden_states = torch.nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - hidden_states = (residual + hidden_states).view(hidden_states_shape) - - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_opt_decoder_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from OPTDecoder.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - - update calculation of mask_seq_length - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache 
if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - mask_seq_length = past_key_values_length - else: - past_key_values_length = 0 - mask_seq_length = seq_length - - # embed positions - # 4d mask is passed through the layers - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - elif attention_mask.shape[1] != mask_seq_length: - raise ValueError( - f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " - f"{mask_seq_length} (sum of the lengths of current and past inputs)" - ) - causal_attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - pos_embeds = self.embed_positions(attention_mask, past_key_values_length, token_idx) - - if self.project_in is not None: - inputs_embeds = self.project_in(inputs_embeds) - - hidden_states = inputs_embeds + pos_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - # check if head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask], ["head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." 
- ) - - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_attention_mask, - head_mask[idx] if head_mask is not None else None, - None, - output_attentions, - use_cache, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if self.final_layer_norm is not None: - hidden_states = self.final_layer_norm(hidden_states) - - if self.project_out is not None: - hidden_states = self.project_out(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -def gaudi_opt_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from OPTModel.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - if not return_dict: - return decoder_outputs - - return BaseModelOutputWithPast( - last_hidden_state=decoder_outputs.last_hidden_state, - 
past_key_values=decoder_outputs.past_key_values, - hidden_states=decoder_outputs.hidden_states, - attentions=decoder_outputs.attentions, - ) - - -class GaudiOPTForCausalLM(OPTForCausalLM): - """ - Inherits from OPTForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - """ - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - logits = self.lm_head(outputs[0]).contiguous() - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(logits.device) - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - 
"use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/owlvit/__init__.py b/optimum/habana/transformers/models/owlvit/__init__.py deleted file mode 100644 index 83b5542fec..0000000000 --- a/optimum/habana/transformers/models/owlvit/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .modeling_owlvit import ( - gaudi_owlvitclasspredictionhead_forward, -) diff --git a/optimum/habana/transformers/models/owlvit/modeling_owlvit.py b/optimum/habana/transformers/models/owlvit/modeling_owlvit.py deleted file mode 100644 index 4ba4cd2ad1..0000000000 --- a/optimum/habana/transformers/models/owlvit/modeling_owlvit.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Optional, Tuple - -import torch - - -def gaudi_owlvitclasspredictionhead_forward( - self, - image_embeds: torch.FloatTensor, - query_embeds: Optional[torch.FloatTensor], - query_mask: Optional[torch.Tensor], -) -> Tuple[torch.FloatTensor]: - """ - Copied from modeling_owlvit: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/owlvit/modeling_owlvit.py#L1233 - The only modification is: - - Replace the torch.where by torch.clamp - """ - - image_class_embeds = self.dense0(image_embeds) - if query_embeds is None: - device = image_class_embeds.device - batch_size, num_patches = image_class_embeds.shape[:2] - pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device) - return (pred_logits, image_class_embeds) - - # Normalize image and text features - image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6) - query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6) - - # Get class predictions - pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds) - - # Apply a learnable shift and scale to logits - logit_shift = self.logit_shift(image_embeds) - logit_scale = self.logit_scale(image_embeds) - logit_scale = self.elu(logit_scale) + 1 - pred_logits = (pred_logits + logit_shift) * logit_scale - - if query_mask is not None: - if query_mask.ndim > 1: - query_mask = torch.unsqueeze(query_mask, dim=-2) - - pred_logits = pred_logits.to(torch.float64) - pred_logits = torch.clamp(pred_logits, min=-1e6) - pred_logits = pred_logits.to(torch.float32) - - return (pred_logits, image_class_embeds) diff --git a/optimum/habana/transformers/models/persimmon/__init__.py b/optimum/habana/transformers/models/persimmon/__init__.py deleted file mode 100644 index 8f0694dc34..0000000000 --- a/optimum/habana/transformers/models/persimmon/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_persimmon import ( - GaudiPersimmonForCausalLM, - gaudi_persimmon_attention_forward, - gaudi_persimmon_decoder_layer_forward, - gaudi_persimmon_model_forward, -) diff --git a/optimum/habana/transformers/models/persimmon/modeling_persimmon.py b/optimum/habana/transformers/models/persimmon/modeling_persimmon.py deleted file mode 100644 index 4ecfd3264f..0000000000 --- a/optimum/habana/transformers/models/persimmon/modeling_persimmon.py +++ /dev/null @@ -1,457 +0,0 @@ -import math -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from 
transformers.models.persimmon.modeling_persimmon import PersimmonForCausalLM, apply_rotary_pos_emb -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_persimmon_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from PersimmonAttention.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/persimmon/modeling_persimmon.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - bsz, q_len, _ = hidden_states.size() - - # [batch_size, seq_length, 3 x hidden_size] - fused_qkv = self.query_key_value(hidden_states) - - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_states, key_states, value_states) = self._split_heads(fused_qkv) - - if self.qk_layernorm: - query_states = self.q_layernorm(query_states) - key_states = self.k_layernorm(key_states) - - # [batch_size, num_heads, seq_length, head_dim] -> [batch_size, seq_length, num_heads, head_dim] - query_states = query_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - if token_idx is not None and past_key_value.get_usable_length(kv_seq_len, self.layer_idx) > 0: - # When token_idx is used, static seq len = (input token len + max output token len) - kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_emb.dim], - query_states[..., self.rotary_emb.dim :], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_emb.dim], - key_states[..., self.rotary_emb.dim :], - ) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - if past_key_value is not None: - if token_idx is not None: - if 0 <= self.layer_idx < len(past_key_value.key_cache): - past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) - past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] - else: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - # Specific to RoPE models with partial rotation - cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query_states.dtype) - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.dense(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def gaudi_persimmon_decoder_layer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - 
""" - Copied from PersimmonDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/persimmon/modeling_persimmon.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - - hidden_states = self.dropout(hidden_states) - hidden_states = hidden_states + residual - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_persimmon_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from PersimmonModel.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/persimmon/modeling_persimmon.py - The only differences are: - - add new args token_idx - - replace _prepare_4d_causal_attention_mask with _gaudi_prepare_4d_causal_attention_mask - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if token_idx is None: - past_key_values_length = past_key_values.get_usable_length(seq_length) - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device) - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.final_layernorm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiPersimmonForCausalLM(PersimmonForCausalLM): - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from PersimmonForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/persimmon/modeling_persimmon.py - The only differences are: - - add new args token_idx - """ - - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - """ - Inherits from PersimmonForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/persimmon/modeling_persimmon.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - token_idx = kwargs.get("token_idx", None) - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
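# Note: the crop below only applies on the stock path (token_idx is None); the
# Gaudi token_idx path in the `else` branch right after it instead gathers just
# the current token with torch.index_select(input_ids, 1, token_idx - 1).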
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/phi/__init__.py b/optimum/habana/transformers/models/phi/__init__.py deleted file mode 100644 index f7429a79df..0000000000 --- a/optimum/habana/transformers/models/phi/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_phi import ( - GaudiPhiAttention, - GaudiPhiDecoderLayer, - GaudiPhiForCausalLM, - GaudiPhiModel, -) diff --git a/optimum/habana/transformers/models/phi/modeling_phi.py b/optimum/habana/transformers/models/phi/modeling_phi.py deleted file mode 100644 index 4c9d9dd4d2..0000000000 --- a/optimum/habana/transformers/models/phi/modeling_phi.py +++ /dev/null @@ -1,672 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
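The docstrings in the OPT and Persimmon ports above, and in the Phi and Qwen2 ports that follow, all refer to the same token_idx convention ("slice next_input_ids from input_ids based on the token_idx", "optimize KV cache"): from the second generation step on, only the current token is gathered from input_ids with index_select, and the new key/value is written into a pre-allocated, fixed-shape cache with index_copy_ rather than grown with torch.cat, so tensor shapes stay constant across decoding steps. The sketch below is illustrative only and not part of any deleted file; the decode_step helper and the toy shapes are hypothetical, while index_select and index_copy_ are the same calls used in the real code.

import torch

batch, heads, max_len, head_dim = 1, 4, 16, 8

# Pre-allocated, fixed-shape key cache: its shape never changes during generation.
k_cache = torch.zeros(batch, heads, max_len, head_dim)

def decode_step(input_ids, new_key, token_idx):
    # From step 2 on, feed only the current token: gather column token_idx - 1
    # instead of slicing input_ids[:, -1:], so the input shape stays constant.
    cur_ids = torch.index_select(input_ids, 1, token_idx - 1)
    # Write the new key into its slot in place of torch.cat, so the cache
    # tensor keeps the same shape for the whole generation.
    k_cache.index_copy_(2, token_idx - 1, new_key)
    return cur_ids, k_cache

# Example: third decoding step for a prompt of length 2.
input_ids = torch.tensor([[11, 12, 13]])          # all ids produced so far
new_key = torch.randn(batch, heads, 1, head_dim)  # key of the current token
cur_ids, cache = decode_step(input_ids, new_key, token_idx=torch.tensor([3]))

In the modules themselves this pattern is wrapped by the KVCache helper (allocate/update) and by each prepare_inputs_for_generation, which forwards token_idx through model_inputs.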
-"""PyTorch Phi model.""" - -import math -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.phi.configuration_phi import PhiConfig -from transformers.models.phi.modeling_phi import ( - PhiAttention, - PhiForCausalLM, - PhiMLP, - PhiModel, - apply_rotary_pos_emb, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_phi_repeat_kv( - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, -): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/phi/modeling_phi.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - - -class Matmul(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - def update(self, prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - return prev - else: - return torch.cat((prev, cur), dim=dim) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - -class GaudiPhiAttention(PhiAttention): - def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx) - self.matmul_qk = Matmul() - self.matmul_av = Matmul() - self.k_cache = KVCache() - self.v_cache = KVCache() - self.inp_seq_len = -1 - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) - device = self.k_proj.weight.device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from PhiAttention.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py - The only differences are: - - add new args token_idx - - optimize KV cache - - add new args reuse_cache - - add new args cache_idx - """ - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.qk_layernorm: - query_states = self.q_layernorm(query_states) - key_states = self.k_layernorm(key_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_shape = ( - (past_key_value[0][-2] if reuse_cache else past_key_value[0].shape[-2]) - if isinstance(past_key_value, tuple) - else past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - ) - if token_idx is not None: - kv_seq_len = kv_shape - else: - kv_seq_len += kv_shape - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_emb.dim], - query_states[..., self.rotary_emb.dim :], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_emb.dim], - key_states[..., self.rotary_emb.dim :], - ) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - if use_cache: - # reuse k, v, self_attention - if reuse_cache: - key_states = self.k_cache(key_states, 2, token_idx) - value_states = self.v_cache(value_states, 2, token_idx) - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if past_key_value is None: - past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) - past_value = torch.zeros( - key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device - ) - past_key_value = (past_key, past_value) - key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) - if token_idx is None: - past_key_value = (key_states, value_states) - - if cache_idx is not None and q_len == 1: - key_states = key_states[:, :, :cache_idx, :] - value_states = value_states[:, :, :cache_idx, :] - if attention_mask is not None: - attention_mask = attention_mask[:, :, :, :cache_idx] - kv_seq_len = key_states.shape[-2] - else: - past_key_value = None - - query_states, key_states, value_states, attention_mask = gaudi_phi_repeat_kv( - query_states, key_states, value_states, attention_mask, self.num_key_value_groups - ) - - # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow - attn_weights = self.matmul_qk( - query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3) - ) / math.sqrt(self.head_dim) - - if attn_weights.size() not in [ - (bsz, self.num_heads, q_len, kv_seq_len), - (bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len), - ]: - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)} or" - f" {(bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() not in [(bsz, 1, q_len, kv_seq_len), (bsz, 1, 1, q_len, kv_seq_len)]: - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or {(bsz, 1, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - - attn_output = self.matmul_av(attn_weights, value_states) - attn_output = attn_output.reshape(bsz, -1, q_len, 
self.head_dim) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.dense(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class GaudiPhiDecoderLayer(torch.nn.Module): - def __init__(self, config: PhiConfig, layer_idx: int): - super().__init__() - self.self_attn = GaudiPhiAttention(config, layer_idx=layer_idx) - self.mlp = PhiMLP(config) - self.input_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.resid_dropout = torch.nn.Dropout(config.resid_pdrop) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from PhiDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py - The only differences are: - - add new args token_idx - - add new args reuse_cache - - add new args cache_idx - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - attn_outputs, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - ) - attn_outputs = self.resid_dropout(attn_outputs) - - feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) - hidden_states = attn_outputs + feed_forward_hidden_states + residual - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -class GaudiPhiModel(PhiModel): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - cache_idx: Optional[int] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from PhiModel.forward: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py - The only 
differences are: - - add new args token_idx - - add new args reuse_cache - - add new args cache_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - use_legacy_cache = True - use_new_cache = False - past_seen_tokens = 0 - if past_key_values is not None and use_cache: - if reuse_cache: - past_seen_tokens = past_key_values[0][0][2] - else: - if use_new_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_seq_length() - else: - past_seen_tokens = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_seen_tokens, seq_length + past_seen_tokens, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - inputs_embeds = self.embed_dropout(inputs_embeds) - - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_seen_tokens - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if not use_new_cache else None - - for layer_idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - None if past_key_values is None else past_key_values[layer_idx], - output_attentions, - use_cache, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=None if past_key_values is None else past_key_values[layer_idx], - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.final_layernorm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - 
all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache - ) - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiPhiForCausalLM(PhiForCausalLM): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - reuse_cache: Optional[bool] = False, - trim_logits: Optional[bool] = False, - cache_idx: Optional[int] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py - The only differences are: - - add new args token_idx - - add new args reuse_cache - - add new args cache_idx - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - reuse_cache=reuse_cache, - cache_idx=cache_idx, - ) - - hidden_states = outputs[0] - _, seq_len, _ = hidden_states.shape - if seq_len > 1 and trim_logits and not self.training: - if token_idx is not None: - hidden_states = hidden_states.index_select(1, token_idx - 1) - else: - hidden_states = hidden_states[:, -1, :] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def 
prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs - ): - """ - Inherits from PhiForCausalLM: https://github.com/huggingface/transformers/blob/v4.37.1/src/transformers/models/phi/modeling_phi.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - past_length = 0 - reuse_cache = kwargs.get("reuse_cache") - # Omit tokens covered by past_key_values - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
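# Note: the crop below only applies on the stock path (token_idx is None); the
# token_idx path above already gathered the current token with index_select, and
# the reuse_cache first-token slicing is handled in the `elif` branch below.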
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - elif reuse_cache and token_idx is not None: - # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "reuse_cache": kwargs.get("reuse_cache"), - "trim_logits": kwargs.get("trim_logits"), - "cache_idx": kwargs.get("cache_idx"), - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/qwen2/__init__.py b/optimum/habana/transformers/models/qwen2/__init__.py deleted file mode 100644 index 8103996d84..0000000000 --- a/optimum/habana/transformers/models/qwen2/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .modeling_qwen2 import ( - GaudiQwen2Attention, - GaudiQwen2DecoderLayer, - GaudiQwen2ForCausalLM, - GaudiQwen2MLP, - GaudiQwen2Model, - gaudi_qwen2_rmsnorm_forward, -) diff --git a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py b/optimum/habana/transformers/models/qwen2/modeling_qwen2.py deleted file mode 100644 index f192cf4898..0000000000 --- a/optimum/habana/transformers/models/qwen2/modeling_qwen2.py +++ /dev/null @@ -1,909 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### -# Copyright (C) 2022-2024 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.qwen2.configuration_qwen2 import Qwen2Config -from transformers.models.qwen2.modeling_qwen2 import ( - Qwen2Attention, - Qwen2DecoderLayer, - Qwen2ForCausalLM, - Qwen2MLP, - Qwen2Model, - Qwen2RMSNorm, - apply_rotary_pos_emb, - logger, -) - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE -except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None - -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm -except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None - -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None - - -import habana_frameworks.torch.core as htcore - - -def gaudi_qwen2_rmsnorm_forward(self, hidden_states): - if hidden_states.device.type == "hpu" and FusedRMSNorm: - # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype - if hidden_states.dtype != self.weight.dtype: - orig_dtype = hidden_states.dtype - hidden_states = FusedRMSNorm.apply(hidden_states.to(self.weight.dtype), self.weight, self.variance_epsilon) - return hidden_states.to(orig_dtype) - else: - hidden_states = FusedRMSNorm.apply(hidden_states, self.weight, self.variance_epsilon) - return hidden_states - else: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class GaudiQwen2MLP(Qwen2MLP): - def pre_mlp_forward(self, x): - inputs = self.act_fn(self.gate_proj(x)) * self.up_proj(x) - output = self.down_proj(inputs) - return output - - def mlp_all_reduce(self, x): - if hasattr(self.down_proj, "all_reduce"): - self.down_proj.all_reduce(x) - - def post_mlp_forward(self, x): - if hasattr(self.down_proj, "post_all_reduce"): - return self.down_proj.post_all_reduce(x) - return x - - -def gaudi_qwen2_repeat_kv( - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, -): - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - - -class Matmul(torch.nn.Module): - def 
__init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class KVCache(torch.nn.Module): - def __init__(self): - super(KVCache, self).__init__() - self.cache = None - self.inp_seq_len = -1 - - def allocate(self, inp_seq_len, dtype, device, shape): - if self.cache is None or self.cache.shape != shape: - self.inp_seq_len = inp_seq_len - self.cache = torch.zeros(shape, dtype=dtype, device=device) - else: - assert ( - self.inp_seq_len == inp_seq_len - ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" - self.cache.fill_(0) - - def update(self, prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.shape == cur.shape: - prev.copy_(cur) - return orig_cur - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - return prev - else: - return torch.cat((prev, cur), dim=dim) - - def get_shape(self): - if self.cache is None: - return None - return self.cache.shape - - def forward(self, cur, dim, idx): - return self.update(self.cache, cur, dim, idx, self.inp_seq_len) - - -class GaudiQwen2Attention(Qwen2Attention): - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx) - - self.matmul_qk = Matmul() - self.matmul_av = Matmul() - self.k_cache = KVCache() - self.v_cache = KVCache() - self.inp_seq_len = -1 - self.norm_factor = 1.0 / math.sqrt(self.head_dim) - self.block_size = 4096 - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) - device = self.k_proj.weight.device - dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) - - def update_sincos_cache(self, seq_len): - # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings - # This helps in avoiding creation of these caches during actual model forward pass and - # reduce memory consumption and improve performance. 
- if seq_len > self.max_position_embeddings: - self.max_position_embeddings = seq_len - _, _ = self.rotary_emb(self.k_proj.weight, seq_len=seq_len) - - def reorder(self, tensor, beam_idx, dim_a, dim_b): - updated = tensor.index_select(0, beam_idx) - tensor.copy_(updated) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - if self.k_cache.cache is None: - return (None, None) - - head_dim = self.k_cache.cache.size(-1) - seq_length = self.k_cache.cache.size(-2) - self.reorder(self.k_cache.cache, beam_idx, seq_length, head_dim) - self.reorder(self.v_cache.cache, beam_idx, seq_length, head_dim) - return (self.k_cache.cache.shape, self.v_cache.cache.shape) - - def gaudi_flash_attn_v1(self, query_layer, key_layer, value_layer, attention_mask, dropout_rate, q_block_size): - """ - Gaudi version of Flash Attention V1 to support long sequence at prompt phase - Causal mask is not supported in this optimization - """ - q_len = query_layer.size(-2) - q_tiles = (q_len // q_block_size) if (q_len % q_block_size == 0) else math.ceil(q_len / q_block_size) - q_padding = q_tiles * q_block_size - q_len - query_layer = F.pad(query_layer, (0, 0, 0, q_padding), "constant", 0) - if attention_mask is not None: - attention_mask = F.pad(attention_mask, (0, 0, 0, q_padding), "constant", -10000.0) - - row_o_list = [] - for i in range(q_tiles): - s, e = i * q_block_size, (i + 1) * q_block_size - row_q = query_layer[:, :, s:e, :] - row_mask = attention_mask[:, :, s:e, :] - attn_output_partial = FusedSDPA.apply(row_q, key_layer, value_layer, row_mask, dropout_rate, False, None) - row_o_list.append(attn_output_partial) - attn_output = torch.cat(row_o_list, dim=-2) - - if q_padding != 0: - attn_output = attn_output[:, :, :-q_padding, :] - - return attn_output - - def pre_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - cache_idx: int = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - The only differences are: - - add new args token_idx - - optimize KV cache - - add new args attn_softmax_bf16 - - add new args reuse_cache - - add new args use_flash_attention - - add new arg flash_attention_recompute - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if token_idx is None: - if hasattr(past_key_value, "get_usable_length"): - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value[0].shape[-2] - else: - if reuse_cache: - kv_seq_len = past_key_value[0][-2] - else: - kv_seq_len = past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - - if use_cache: - # reuse k, v, self_attention - if reuse_cache: - key_states = self.k_cache(key_states, 2, token_idx) - value_states = self.v_cache(value_states, 2, token_idx) - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - if past_key_value is None: - past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) - past_value = torch.zeros( - key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device - ) - past_key_value = (past_key, past_value) - key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) - if token_idx is None: - past_key_value = (key_states, value_states) - - if cache_idx is not None and q_len == 1: - key_states = key_states[:, :, :cache_idx, :] - value_states = value_states[:, :, :cache_idx, :] - if attention_mask is not None: - attention_mask = attention_mask[:, :, :, :cache_idx] - kv_seq_len = key_states.shape[-2] - else: - past_key_value = None - - if use_flash_attention and FusedSDPA: - import habana_frameworks.torch.hpu as ht - - if q_len == 1: - # next token - with ht.sdp_kernel(enable_recompute=False): - attn_output = FusedSDPA.apply( - query_states, key_states, value_states, attention_mask, 0.0, False, None - ) - else: - # first token - if flash_attention_causal_mask: - # causal masking on first token requires inputs to be of the same length - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - attn_output = FusedSDPA.apply(query_states, key_states, value_states, None, 0.0, True, None) - else: - with ht.sdp_kernel(enable_recompute=flash_attention_recompute): - if q_len > 8192: - attn_output = self.gaudi_flash_attn_v1( - query_states, key_states, value_states, attention_mask, 0.0, self.block_size - ) - htcore.mark_step() - else: - attn_output = FusedSDPA.apply( - query_states, key_states, value_states, attention_mask, 0.0, False, None - ) - - else: - query_states, key_states, value_states, attention_mask = gaudi_qwen2_repeat_kv( - query_states, key_states, value_states, attention_mask, self.num_key_value_groups - ) - - attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask - if cache_position is not None: - 
causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - if attn_softmax_bf16: - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) - else: - # upcast attention to fp32 - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( - query_states.dtype - ) - attn_weights = torch.nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = self.matmul_av(attn_weights, value_states) - attn_output = attn_output.reshape(bsz, -1, q_len, self.head_dim) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def attention_all_reduce(self, attn_output): - if hasattr(self.o_proj, "all_reduce"): - self.o_proj.all_reduce(attn_output) - - def post_attn_forward(self, attn_output): - if hasattr(self.o_proj, "post_all_reduce"): - self.o_proj.post_all_reduce(attn_output) - return attn_output - - -class GaudiQwen2DecoderLayer(Qwen2DecoderLayer): - def __init__(self, config: Qwen2Config, layer_idx: int): - super(Qwen2DecoderLayer, self).__init__() - self.hidden_size = config.hidden_size - - self.self_attn = GaudiQwen2Attention(config, layer_idx) - - self.mlp = GaudiQwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.self_attn.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.self_attn.update_sincos_cache(seq_len) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - cache_idx: int = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - residual = hidden_states - - hidden_states, self_attn_weights, present_key_value = self.pre_attn( - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - cache_position, - token_idx, - attn_softmax_bf16, - reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - cache_idx=cache_idx, - ) - self.self_attn.attention_all_reduce(hidden_states) - hidden_states, residual = self.post_attn_pre_mlp(hidden_states, residual) - 
self.mlp.mlp_all_reduce(hidden_states) - hidden_states = self.post_mlp(hidden_states, residual) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - def pre_attn( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - cache_idx: int = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - hidden_states = self.input_layernorm(hidden_states) - hidden_states, attn_weights, present_key_value = self.self_attn.pre_attn_forward( - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - cache_position, - token_idx, - attn_softmax_bf16, - reuse_cache, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - cache_idx=cache_idx, - ) - return hidden_states, attn_weights, present_key_value - - def post_attn_pre_mlp(self, hidden_states, residual): - hidden_states = self.self_attn.post_attn_forward(hidden_states) - - if self.training: - hidden_states = hidden_states + residual - residual = hidden_states - else: - residual.add_(hidden_states) - hidden_states = residual - - hidden_states = self.post_attention_layernorm(hidden_states) - - hidden_states = self.mlp.pre_mlp_forward(hidden_states) - return hidden_states, residual - - def post_mlp(self, hidden_states, residual): - hidden_states = self.mlp.post_mlp_forward(hidden_states) - - if self.training: - hidden_states = hidden_states + residual - else: - residual.add_(hidden_states) - hidden_states = residual - - return hidden_states - - -class GaudiQwen2Model(Qwen2Model): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) - - def update_sincos_cache(self, seq_len): - for layer in self.layers: - layer.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - cache_idx: int = None, - lazy_mode: Optional[bool] = True, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - self._attn_implementation = "eager" - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - use_new_cache = False # Ignoring new Cache path for HPU - past_seen_tokens = 0 - - if past_key_values is not None and use_cache: # kept for BC (cache positions) - if reuse_cache: - past_seen_tokens = past_key_values[0][0][2] - else: - if use_new_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_seen_tokens = past_key_values.get_usable_length(seq_length) - else: - past_seen_tokens = past_key_values[0][0].shape[2] - - if position_ids is None: - position_ids = torch.arange( - past_seen_tokens, seq_length + past_seen_tokens, dtype=torch.long, device=inputs_embeds.device - ) - position_ids = position_ids.unsqueeze(0) - cache_position = None - - # HPU specific mask generation - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, - input_ids.shape if input_ids is not None else (batch_size, seq_length), - inputs_embeds, - past_seen_tokens, - ) - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if not use_new_cache else None - - if lazy_mode: - htcore.mark_step() - - for layer_idx, decoder_layer in enumerate(self.layers): - if ( - lazy_mode - and not self.training - and (torch.distributed.is_initialized() is False or torch.distributed.get_world_size() == 1) - ): - htcore.mark_step() - - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - None, - attn_softmax_bf16, - False, - use_flash_attention, - flash_attention_recompute, - flash_attention_causal_mask, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=None if past_key_values is None else past_key_values[layer_idx], - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - token_idx=token_idx, - attn_softmax_bf16=attn_softmax_bf16, - reuse_cache=reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, 
- flash_attention_causal_mask=flash_attention_causal_mask, - cache_idx=cache_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache - ) - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiQwen2ForCausalLM(Qwen2ForCausalLM): - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) - - def reorder_kv_cache(self, beam_idx: torch.LongTensor): - return self.model.reorder_kv_cache(beam_idx) - - def update_sincos_cache(self, seq_len): - self.model.update_sincos_cache(seq_len) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - token_idx: Optional[torch.Tensor] = None, - trim_logits: Optional[bool] = False, - attn_softmax_bf16: Optional[bool] = False, - reuse_cache: Optional[bool] = False, - use_flash_attention: Optional[bool] = False, - flash_attention_recompute: Optional[bool] = False, - flash_attention_causal_mask: Optional[bool] = False, - cache_idx: int = None, - lazy_mode: Optional[bool] = True, - ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if not hasattr(self.config, "_attn_implementation"): - setattr(self.config, "_attn_implementation", "eager") - else: - self.config._attn_implementation = "eager" - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - token_idx=token_idx, - attn_softmax_bf16=attn_softmax_bf16, - reuse_cache=reuse_cache, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - flash_attention_causal_mask=flash_attention_causal_mask, - cache_idx=cache_idx, - lazy_mode=lazy_mode, - ) - - hidden_states = outputs[0] - _, seq_len, _ = hidden_states.shape - if seq_len > 1 and 
trim_logits and not self.training: - if token_idx is not None: - hidden_states = hidden_states.index_select(1, token_idx - 1) - else: - hidden_states = hidden_states[:, -1, :] - - logits = self.lm_head(hidden_states).float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = torch.nn.CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs - ): - past_length = 0 - - reuse_cache = kwargs.get("reuse_cache") - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
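# --- Editorial note: worked example, not part of the deleted file. ---
# The crop performed just below keeps only the last `max_cache_length`
# columns of the attention mask so it matches the bounded KV cache.
# Sizes here are made up: a mask covering 10 positions, cache capped at 8.
import torch

attention_mask = torch.ones(1, 10, dtype=torch.long)
max_cache_length = 8
cropped = attention_mask[:, -max_cache_length:]
assert cropped.shape == (1, 8)
# --- End of editorial example. ---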
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - elif reuse_cache and token_idx is not None: - # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass - input_ids = input_ids[:, :token_idx] - attention_mask = attention_mask[:, :token_idx] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - cache_position = None - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids.contiguous(), - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - "trim_logits": kwargs.get("trim_logits"), - "attn_softmax_bf16": kwargs.get("attn_softmax_bf16"), - "reuse_cache": reuse_cache, - "use_flash_attention": kwargs.get("use_flash_attention"), - "flash_attention_recompute": kwargs.get("flash_attention_recompute"), - "flash_attention_causal_mask": kwargs.get("flash_attention_causal_mask"), - "cache_idx": kwargs.get("cache_idx"), - "lazy_mode": kwargs.get("lazy_mode"), - } - ) - return model_inputs - - -def apply_customized_rope(q, k, cos, sin, position_ids): - if q.device.type == "hpu" and FusedRoPE: - # TODO: remove `.clone()` when it is fixed in SynapseAI - return FusedRoPE.apply( - q, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ), FusedRoPE.apply( - k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids - ) - else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) diff --git a/optimum/habana/transformers/models/seamless_m4t/__init__.py b/optimum/habana/transformers/models/seamless_m4t/__init__.py deleted file mode 100644 index 7bd87abee8..0000000000 --- a/optimum/habana/transformers/models/seamless_m4t/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .modeling_seamless_m4t import ( - gaudi_SeamlessM4TAttention_forward, - gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths, - gaudi_SeamlessM4TDecoder_forward, - gaudi_SeamlessM4TDecoderLayer_forward, - gaudi_SeamlessM4TForTextToSpeech_forward, - gaudi_SeamlessM4TForTextToSpeech_generate, - gaudi_SeamlessM4TForTextToSpeech_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_forward, - gaudi_SeamlessM4TTextToUnitForConditionalGeneration_prepare_inputs_for_generation, - gaudi_SeamlessM4TTextToUnitModel_forward, -) diff --git a/optimum/habana/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/optimum/habana/transformers/models/seamless_m4t/modeling_seamless_m4t.py deleted file mode 100644 index d4728c30f0..0000000000 --- a/optimum/habana/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ /dev/null @@ -1,866 +0,0 @@ -from typing import 
Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, - Seq2SeqModelOutput, -) -from transformers.models.seamless_m4t.modeling_seamless_m4t import ( - SeamlessM4TForTextToSpeech, - SeamlessM4TGenerationOutput, - _compute_new_attention_mask, - format_speech_generation_kwargs, - shift_tokens_right, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_SeamlessM4TAttention_forward( - self, - hidden_states: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from SeamlessM4TAttention.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - - # if encoder_hidden_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = encoder_hidden_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - # `past_key_value[0].shape[2] == encoder_hidden_states.shape[1]` - # is checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `encoder_hidden_states` to support prefix tuning - if ( - is_cross_attention - and past_key_value is not None - and past_key_value[0].shape[2] == encoder_hidden_states.shape[1] - ): - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(encoder_hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(encoder_hidden_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - if token_idx is None: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - past_key_value[0].index_copy_(2, token_idx - 1, key_states) - past_key_value[1].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value[0] - value_states = past_key_value[1] - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.reshape(*proj_shape) - value_states = value_states.reshape(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned across GPUs when using tensor-parallelism. 
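# --- Editorial note: illustrative shape walk-through, not part of the deleted file. ---
# The bmm-based attention above folds the heads into the batch dimension for
# the two matrix multiplications and then, right below, unfolds them and
# merges head_dim back into embed_dim. All sizes here are made up.
import torch

bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 7, 8
embed_dim = num_heads * head_dim
q = torch.randn(bsz * num_heads, tgt_len, head_dim)
k = torch.randn(bsz * num_heads, src_len, head_dim)
v = torch.randn(bsz * num_heads, src_len, head_dim)
attn_weights = torch.bmm(q, k.transpose(1, 2)).softmax(dim=-1)  # (bsz*heads, tgt_len, src_len)
attn_output = torch.bmm(attn_weights, v)                        # (bsz*heads, tgt_len, head_dim)
attn_output = attn_output.view(bsz, num_heads, tgt_len, head_dim).transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)      # (bsz, tgt_len, embed_dim)
assert attn_output.shape == (bsz, tgt_len, embed_dim)
# --- End of editorial sketch. ---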
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -def gaudi_SeamlessM4TDecoderLayer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - token_idx: Optional[torch.Tensor] = None, -) -> torch.Tensor: - """ - Copied from SeamlessM4TDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - residual = hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = self.attn_dropout(hidden_states) - hidden_states = residual + hidden_states - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - hidden_states = self.cross_attention_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.cross_attention( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - past_key_value=cross_attn_past_key_value, - attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) - hidden_states = self.attn_dropout(hidden_states) - hidden_states = residual + hidden_states - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value += cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - - hidden_states = self.ffn_layer_norm(hidden_states) - - hidden_states = self.ffn(hidden_states) - hidden_states = self.ffn_dropout(hidden_states) - - hidden_states = residual + hidden_states - - outputs = (hidden_states, present_key_value) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - return outputs - - -def gaudi_SeamlessM4TDecoder_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from SeamlessM4TDecoder.forward: 
https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input = input_ids - input_shape = input.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - input = inputs_embeds[:, :, -1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask( - encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ) - - # embed positions - if past_key_values_length != 0 and token_idx is not None: - past_key_values_length = token_idx - 1 - positions = self.embed_positions(input, past_key_values_length=past_key_values_length) - - hidden_states = inputs_embeds + positions.to(inputs_embeds.device) - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - if self.training: - dropout_probability = torch.rand([]) - if dropout_probability < self.layerdrop: - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - None, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[1],) - - if output_attentions: - all_self_attns += (layer_outputs[2],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[3],) - - hidden_states = self.layer_norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -def gaudi_SeamlessM4TTextToUnitModel_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: - """ - Copied from SeamlessM4TTextToUnitModel.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -def gaudi_SeamlessM4TTextToUnitForConditionalGeneration_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]: - """ - Copied from SeamlessM4TTextToUnitForConditionalGeneration.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if labels is not None: - if use_cache: - logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") - use_cache = False - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - encoder_outputs=encoder_outputs, - decoder_attention_mask=decoder_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - 
decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - lm_logits = self.lm_head(outputs[0]) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - labels = labels.to(lm_logits.device) - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -def gaudi_SeamlessM4TTextToUnitForConditionalGeneration_prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, -): - """ - Copied from SeamlessM4TTextToUnitForConditionalGeneration.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - token_idx = kwargs.get("token_idx", None) - # cut decoder_input_ids if past is used - if past_key_values is not None: - if token_idx is None: - decoder_input_ids = decoder_input_ids[:, -1:] - else: - decoder_input_ids = torch.index_select(decoder_input_ids, 1, token_idx - 1) - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - "token_idx": token_idx, - "decoder_attention_mask": kwargs.get("decoder_attention_mask", None), - } - - -def gaudi_SeamlessM4TCodeHifiGan_get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]): - """ - Copied from SeamlessM4TCodeHifiGan._get_output_hifigan_lengths: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - fix torch.div issue - """ - - def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1): - # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html - return ( - torch.div(input_length.item() + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") - + 1 - ) - - def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1): - return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1 - - # conv_pre - input_lengths = _conv_out_length(input_lengths, 7, 1, 3) - - # upsampler - for i, (upsample_rate, kernel_size) in enumerate( - zip(self.config.upsample_rates, self.config.upsample_kernel_sizes) - ): - input_lengths = _transpose_conv_out_length( - input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2 - ) - - # resblock - for i in range(len(self.config.upsample_rates)): - for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes): - for dil in dilation: - input_lengths = _conv_out_length( - input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil - ) - - for dil in dilation: - input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1) - - # conv_post - input_lengths = _conv_out_length(input_lengths, 7, 1, 3) - - return input_lengths - - -def gaudi_SeamlessM4TForTextToSpeech_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.LongTensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]: - """ - Copied from SeamlessM4TForTextToSpeech.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx args - """ - if labels is not None: - if use_cache: - logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") - use_cache = False - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.pad_token_id, self.config.decoder_start_token_id - ) - - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn - logger.warning( - "This is the same forward method as `SeamlessM4TForTextToText`." - "It doesn't use the text-to-unit model `SeamlessM4TTextToUnitForConditionalGeneration`." - "If you want to generate speech, use the `.generate` method." - ) - encoder_outputs = self.text_encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - encoder_attention_mask = attention_mask - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.text_decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - lm_logits = self.lm_head(decoder_outputs[0]) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - labels = labels.to(lm_logits.device) - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - outputs = decoder_outputs + encoder_outputs - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -@torch.no_grad() -def gaudi_SeamlessM4TForTextToSpeech_generate( - self, - input_ids: Optional[torch.Tensor] = None, - return_intermediate_token_ids: Optional[bool] = None, - tgt_lang: Optional[str] = None, - spkr_id: Optional[int] = 0, - **kwargs, -) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]: - """ - Copied from SeamlessM4TForTextToSpeech.generate: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - delete pad id for unit_ids output - """ - batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")) - - if tgt_lang is None: - raise ValueError("You must specify a `tgt_lang` to generate 
translated speech.") - else: - # also accept __xxx__ - tgt_lang = tgt_lang.replace("__", "") - for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: - lang_code_to_id = getattr(self.generation_config, key, None) - if lang_code_to_id is None: - raise ValueError( - f"""This model generation config doesn't have a `{key}` key which maps the target language - to the right token id. Make sure to load the right generation config.""" - ) - elif tgt_lang not in lang_code_to_id: - raise ValueError( - f"""`tgt_lang={tgt_lang}` is not supported by this model. - Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4T supports - more languages for text translation than for speech synthesis.""" - ) - if kwargs.get("hpu_graphs", True): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - if not hasattr(self, "clear_cache"): - self = wrap_in_hpu_graph(self) - if not hasattr(self.t2u_model, "clear_cache"): - self.t2u_model = wrap_in_hpu_graph(self.t2u_model) - if not hasattr(self.vocoder, "clear_cache"): - self.vocoder = wrap_in_hpu_graph(self.vocoder) - - kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) - kwargs_text["output_hidden_states"] = True - kwargs_text["return_dict_in_generate"] = True - kwargs_text["output_scores"] = True - - text_decoder_input_ids = kwargs_text.get("decoder_input_ids") - - # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. - text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) - text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device) - - kwargs_text["decoder_input_ids"] = text_decoder_input_ids - - # first generation - text_generation_output = super(SeamlessM4TForTextToSpeech, self).generate(input_ids, **kwargs_text) - sequences = text_generation_output.sequences - - # prepare second generation - num_return_sequences = len(sequences) // batch_size - attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) - - encoder_hidden_states = text_generation_output.encoder_hidden_states[-1] - - # take care of num_return_sequences - # take most probable hidden states per batch of return_sequences - # (batch_size*num_return_sequences, ...) -> (batch_size,...) 
- if num_return_sequences > 1: - idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1) - idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1) - idx_most_probable_sequences_per_batch = ( - idx_most_probable_sequences_per_batch + torch.arange(batch_size).to(self.device) * num_return_sequences - ) - sequences = sequences[idx_most_probable_sequences_per_batch] - - # get decoder last hidden state - must do a pass through the text decoder - t2u_input_embeds = self.text_decoder( - input_ids=sequences, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=attention_mask, - ).last_hidden_state - - pad_token_id = self.generation_config.pad_token_id - - # Compute new attention mask - seq_lens = (sequences != pad_token_id).int().sum(1) - t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) - kwargs_speech["attention_mask"] = t2u_model_attention_mask - - # Compute t2u decoder_input_ids - t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids") - t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang) - t2u_decoder_input_ids = torch.tensor([[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size).to( - self.device - ) - kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids - - # second generation - unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech) - seq_lens = (unit_ids != self.config.t2u_pad_token_id).int().sum(1) - unit_ids = unit_ids[:, 0:seq_lens] - output_unit_ids = unit_ids.detach().clone() - - # get rid of t2u_decoder_input_ids - unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :] - # replace eos per pad - unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id - # offset of control symbols - unit_ids = torch.where(unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset) - - vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) - vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids)).to(self.device) - - spkr_id = torch.tensor([[spkr_id]] * len(unit_ids)).to(self.device) - - waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id) - - if return_intermediate_token_ids: - return SeamlessM4TGenerationOutput( - waveform=waveform, - waveform_lengths=waveform_lengths, - sequences=sequences, - unit_sequences=output_unit_ids, - ) - - return waveform, waveform_lengths - - -def gaudi_SeamlessM4TForTextToSpeech_prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, -): - """ - Copied from SeamlessM4TForTextToSpeech.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py - The only differences are: - - add token_idx - """ - token_idx = kwargs.get("token_idx", None) - # cut decoder_input_ids if past is used - if past_key_values is not None: - if token_idx is None: - decoder_input_ids = decoder_input_ids[:, -1:] - else: - decoder_input_ids = torch.index_select(decoder_input_ids, 1, token_idx - 1) - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - "token_idx": token_idx, - "decoder_attention_mask": kwargs.get("decoder_attention_mask", None), - } diff --git a/optimum/habana/transformers/models/speecht5/__init__.py b/optimum/habana/transformers/models/speecht5/__init__.py deleted file mode 100644 index c0bc3d17bd..0000000000 --- a/optimum/habana/transformers/models/speecht5/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_speecht5 import ( - gaudi_generate_speech, - gaudi_SpeechT5Attention_forward, - gaudi_SpeechT5Decoder_forward, - gaudi_SpeechT5DecoderLayer_forward, -) diff --git a/optimum/habana/transformers/models/speecht5/modeling_speecht5.py b/optimum/habana/transformers/models/speecht5/modeling_speecht5.py deleted file mode 100644 index 07c4fa8a14..0000000000 --- a/optimum/habana/transformers/models/speecht5/modeling_speecht5.py +++ /dev/null @@ -1,523 +0,0 @@ -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions -from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet, SpeechT5PreTrainedModel -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_SpeechT5Attention_forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - position_bias: Optional[torch.Tensor] = None, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from SpeechT5Attention.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py - The only differences are: - - add new args token_idx - """ - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - if token_idx is not None: - past_key_value[0].index_copy_(2, token_idx - 1, key_states) - past_key_value[1].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value[0] - value_states = past_key_value[1] - else: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - 
value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - # relative attention bias - if position_bias is not None: - reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1) - rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1)) - rel_pos_bias = rel_pos_bias.transpose(0, 1).view( - bsz * self.num_heads, position_bias.size(0), position_bias.size(1) - ) - attn_weights += rel_pos_bias - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned aross GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -def gaudi_SpeechT5DecoderLayer_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - cross_attn_layer_head_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = True, - token_idx: Optional[torch.Tensor] = None, -): - """ - Copied from SpeechT5DecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py - The only differences are: - - add token_idx in self-attention - """ - residual = hidden_states - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = self.dropout(hidden_states) - hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = self.dropout(hidden_states) - hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # add cross-attn to positions 3,4 of present_key_value tuple - 
present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - hidden_states = hidden_states + self.feed_forward(hidden_states) - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights, cross_attn_weights) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_SpeechT5Decoder_forward( - self, - hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.Tensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - """ - Copied from SpeechT5Decoder.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py - The only differences are: - - add token_idx args - - use _gaudi_prepare_4d_causal_attention_mask - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - input_shape = hidden_states.size()[:-1] - - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, input_shape, hidden_states, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask( - encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1] - ) - - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." 
- ) - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - skip_the_layer = False - if self.training: - dropout_probability = torch.rand([]) - skip_the_layer = dropout_probability < self.layerdrop - if skip_the_layer and not deepspeed_zero3_is_enabled: - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - head_mask[idx] if head_mask is not None else None, - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, - None, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -def gaudi_generate_speech( - model: SpeechT5PreTrainedModel, - input_values: torch.FloatTensor, - speaker_embeddings: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, - threshold: float = 0.5, - minlenratio: float = 0.0, - maxlenratio: float = 20.0, - vocoder: Optional[nn.Module] = None, - output_cross_attentions: bool = False, - return_output_lengths: bool = False, -) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]: - """ - Copied from _generate_speech: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/speecht5/modeling_speecht5.py - The only differences are: - - add hpu graph wrap - - add static shape support in kv-cache in _generate_speech - - disable speech_decoder_prenet_dropout to avoid variable output length - """ - if speaker_embeddings is None: - raise ValueError( - """`speaker_embeddings` must be specified. 
For example, you can use a speaker embeddings by following - the code snippet provided in this link: - https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors - """ - ) - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - if not hasattr(model.speecht5.encoder, "clear_cache"): - model.speecht5.encoder = wrap_in_hpu_graph(model.speecht5.encoder) - if not hasattr(model.speecht5.decoder.wrapped_decoder, "clear_cache"): - model.speecht5.decoder.wrapped_decoder = wrap_in_hpu_graph(model.speecht5.decoder.wrapped_decoder) - if not hasattr(model.speecht5.decoder.prenet, "clear_cache"): - model.speecht5.decoder.prenet = wrap_in_hpu_graph(model.speecht5.decoder.prenet) - - if attention_mask is None: - encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int() - else: - encoder_attention_mask = attention_mask - - bsz = input_values.size(0) - encoder_out = model.speecht5.encoder( - input_values=input_values, - attention_mask=encoder_attention_mask, - return_dict=True, - ) - - encoder_last_hidden_state = encoder_out.last_hidden_state - - # downsample encoder attention mask - if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): - encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( - encoder_out[0].shape[1], encoder_attention_mask - ) - - maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor) - minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor) - - # Start the output sequence with a mel spectrum that is all zeros. - output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins) - output_sequence = torch.nn.functional.pad(output_sequence, (0, 0, 0, maxlen - 1), value=model.config.pad_token_id) - spectrogram = [] - cross_attentions = [] - past_key_values = None - idx = 0 - result_spectrogram = {} - token_idx = torch.tensor(1, device=output_sequence.device) - attention_mask = torch.zeros((bsz, maxlen), dtype=torch.long, device=output_sequence.device) - while True: - idx += 1 - attention_mask.index_fill_(1, token_idx - 1, 1) - # Run the decoder prenet on the entire output sequence. - decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) - # Run the decoder layers on the last element of the prenet output. - decoder_out = model.speecht5.decoder.wrapped_decoder( - hidden_states=decoder_hidden_states - if past_key_values is None - else torch.index_select(decoder_hidden_states, 1, token_idx - 1), - attention_mask=attention_mask, - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=True, - output_attentions=output_cross_attentions, - return_dict=True, - token_idx=token_idx, - ) - - if output_cross_attentions: - cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0)) - - last_decoder_output = decoder_out.last_hidden_state[:, 0:1, :].squeeze(1) - past_key_values = decoder_out.past_key_values - # Predict the new mel spectrum for this step in the sequence. - spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output) - spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins) - spectrogram.append(spectrum) - output_sequence.index_copy_(1, token_idx, spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)) - # Predict the probability that this is the stop token. 
- prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) - token_idx.add_(1) - # Finished when stop token or maximum length is reached. - if idx < minlen: - continue - else: - # If the generation loop is less than maximum length time, check the ones in the batch that have met - # the prob threshold. Otherwise, assume all have met thresholds and fill other spectrograms for the batch. - if idx < maxlen: - meet_thresholds = torch.sum(prob, dim=-1) >= threshold - meet_indexes = torch.where(meet_thresholds)[0].tolist() - else: - meet_indexes = range(len(prob)) - meet_indexes = [i for i in meet_indexes if i not in result_spectrogram] - if len(meet_indexes) > 0: - spectrograms = torch.stack(spectrogram) - spectrograms = spectrograms.transpose(0, 1).flatten(1, 2) - spectrograms = model.speech_decoder_postnet.postnet(spectrograms) - for meet_index in meet_indexes: - result_spectrogram[meet_index] = spectrograms[meet_index] - if len(result_spectrogram) >= bsz: - break - - spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))] - if not return_output_lengths: - spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) - if vocoder is not None: - outputs = vocoder(spectrogram) - else: - outputs = spectrogram - if output_cross_attentions: - cross_attentions = torch.cat(cross_attentions, dim=2) - if bsz > 1: - cross_attentions = cross_attentions.view( - bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:] - ) - outputs = (outputs, cross_attentions) - else: - # batched return values should also include the spectrogram/waveform lengths - spectrogram_lengths = [] - for i in range(bsz): - spectrogram_lengths.append(spectrograms[i].size(0)) - if vocoder is None: - spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) - outputs = (spectrograms, spectrogram_lengths) - else: - waveforms = [] - spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True) - waveforms = vocoder(spectrograms) - waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths] - outputs = (waveforms, waveform_lengths) - if output_cross_attentions: - cross_attentions = torch.cat(cross_attentions, dim=2) - cross_attentions = cross_attentions.view( - bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:] - ) - outputs = (*outputs, cross_attentions) - return outputs diff --git a/optimum/habana/transformers/models/stablelm/__init__.py b/optimum/habana/transformers/models/stablelm/__init__.py deleted file mode 100644 index 9ae5d787bb..0000000000 --- a/optimum/habana/transformers/models/stablelm/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_stablelm import ( - GaudiStableLmDecoderLayer, - GaudiStableLmForCausalLM, - gaudi_stablelm_attention_forward, - gaudi_stablelm_model_forward, -) diff --git a/optimum/habana/transformers/models/stablelm/modeling_stablelm.py b/optimum/habana/transformers/models/stablelm/modeling_stablelm.py deleted file mode 100644 index f53994a5b1..0000000000 --- a/optimum/habana/transformers/models/stablelm/modeling_stablelm.py +++ /dev/null @@ -1,486 +0,0 @@ -import math -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from 
transformers.models.stablelm.configuration_stablelm import StableLmConfig -from transformers.models.stablelm.modeling_stablelm import ( - StableLmAttention, - StableLmForCausalLM, - StableLmMLP, - apply_rotary_pos_emb, - repeat_kv, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_stablelm_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - token_idx: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from StableLmAttention.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/stablelm/modeling_stablelm.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - if self.qk_layernorm: - query_states = self.q_layernorm(query_states) - key_states = self.k_layernorm(key_states) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - if token_idx is not None and past_key_value.get_usable_length(kv_seq_len, self.layer_idx) > 0: - # When token_idx is used, static seq len = (input token len + max output token len) - kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_emb.dim], - query_states[..., self.rotary_emb.dim :], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_emb.dim], - key_states[..., self.rotary_emb.dim :], - ) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - if past_key_value is not None: - if token_idx is not None: - if 0 <= self.layer_idx < len(past_key_value.key_cache): - past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) - past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] - else: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - # Specific to RoPE models with partial rotation - cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # Repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query_states.dtype) - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class GaudiStableLmDecoderLayer(torch.nn.Module): - def __init__(self, config: StableLmConfig, layer_idx: int): - super().__init__() - self.use_parallel_residual = config.use_parallel_residual - self.hidden_size = config.hidden_size - self.self_attn = 
StableLmAttention(config, layer_idx=layer_idx) - self.mlp = StableLmMLP(config) - self.input_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.post_attention_layernorm = None - if not self.use_parallel_residual: - self.post_attention_layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = torch.nn.Dropout(config.hidden_dropout) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from StableLmDecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/stablelm/modeling_stablelm.py - The only differences are: - - add new args token_idx - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - self_attn_output, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - - # copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXLayer.forward - if self.use_parallel_residual: - # x = x + attn(ln1(x)) + mlp(ln1(x)) - # Fully Connected - mlp_output = self.mlp(hidden_states) - mlp_output = self.dropout(mlp_output) - hidden_states = residual + self_attn_output + mlp_output - else: - # x = x + attn(ln1(x)) - # x = x + mlp(ln2(x)) - residual = residual + self_attn_output - # Fully Connected - mlp_output = self.mlp(self.post_attention_layernorm(residual)) - mlp_output = self.dropout(mlp_output) - hidden_states = residual + mlp_output - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_stablelm_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from StableLmModel.forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/stablelm/modeling_stablelm.py - The only differences are: - - add new args token_idx - - replace _prepare_4d_causal_attention_mask with _gaudi_prepare_4d_causal_attention_mask - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - 
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if token_idx is None: - past_key_values_length = past_key_values.get_usable_length(seq_length) - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiStableLmForCausalLM(StableLmForCausalLM): - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: 
Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from StableLmForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/stablelm/modeling_stablelm.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - """ - Inherits from StableLmForCausalLM: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/stablelm/modeling_stablelm.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - token_idx = kwargs.get("token_idx", None) - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/starcoder2/__init__.py b/optimum/habana/transformers/models/starcoder2/__init__.py deleted file mode 100644 index c4ac4dfc22..0000000000 --- a/optimum/habana/transformers/models/starcoder2/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .modeling_starcoder2 import ( - GaudiStarcoder2DecoderLayer, - GaudiStarcoder2ForCausalLM, - gaudi_starcoder2_attention_forward, - gaudi_starcoder2_model_forward, -) diff --git a/optimum/habana/transformers/models/starcoder2/modeling_starcoder2.py b/optimum/habana/transformers/models/starcoder2/modeling_starcoder2.py deleted file mode 100644 index c402833319..0000000000 --- a/optimum/habana/transformers/models/starcoder2/modeling_starcoder2.py +++ /dev/null @@ -1,472 +0,0 @@ -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.models.starcoder2.modeling_starcoder2 import ( - Starcoder2Attention, - Starcoder2Config, - Starcoder2DecoderLayer, - Starcoder2ForCausalLM, - apply_rotary_pos_emb, - repeat_kv, -) -from transformers.utils import logging - -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - - -logger = logging.get_logger(__name__) - - -def gaudi_starcoder2_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = 
False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Copied from Starcoder2Attention.forward: https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/starcoder2/modeling_starcoder2.py - The only differences are: - - add new args token_idx - - optimize KV cache - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - if token_idx is not None and past_key_value.get_usable_length(kv_seq_len, self.layer_idx) > 0: - # When token_idx is used, static seq len = (input token len + max output token len) - kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - else: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - if token_idx is not None: - if 0 <= self.layer_idx < len(past_key_value.key_cache): - past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states) - past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states) - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] - else: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, 
p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - attn_output = nn.functional.dropout(attn_output, p=self.residual_dropout, training=self.training) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class GaudiStarcoder2DecoderLayer(Starcoder2DecoderLayer): - def __init__(self, config: Starcoder2Config, layer_idx: int): - super().__init__(config, layer_idx) - self.self_attn = Starcoder2Attention(config, layer_idx) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Copied from Starcoder2DecoderLayer.forward: https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/starcoder2/modeling_starcoder2.py - The only differences are: - - add new args token_idx - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -def gaudi_starcoder2_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - """ - Copied from Starcoder2Model.forward: https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/starcoder2/modeling_starcoder2.py - The only differences are: - - add new args token_idx - - replace _prepare_4d_causal_attention_mask with _gaudi_prepare_4d_causal_attention_mask - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else 
self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if token_idx is None: - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Starcoder2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. 
" - ) - - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - hidden_states = nn.functional.dropout(hidden_states, p=self.embedding_dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - token_idx=token_idx, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class GaudiStarcoder2ForCausalLM(Starcoder2ForCausalLM): - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.Tensor] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - """ - Inherits from Starcoder2ForCausalLM: https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/starcoder2/modeling_starcoder2.py - The only differences are: - - add new args token_idx - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - hidden_states = 
outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - """ - Inherits from Starcoder2ForCausalLM: https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/starcoder2/modeling_starcoder2.py - The only differences are: - - add new args token_idx - - add token_idx into model_inputs - - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx - """ - token_idx = kwargs.get("token_idx", None) - # Omit tokens covered by past_key_values - if past_key_values is not None: - if token_idx is None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - else: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - if token_idx is not None: - position_ids = torch.index_select(position_ids, 1, token_idx - 1) - else: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "token_idx": token_idx, - } - ) - return model_inputs diff --git a/optimum/habana/transformers/models/swin/__init__.py b/optimum/habana/transformers/models/swin/__init__.py deleted file mode 100644 index 59dbee4d5d..0000000000 --- a/optimum/habana/transformers/models/swin/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_swin import gaudi_swin_get_attn_mask diff --git a/optimum/habana/transformers/models/swin/modeling_swin.py b/optimum/habana/transformers/models/swin/modeling_swin.py deleted file mode 100644 index 9ea3b9d28c..0000000000 --- a/optimum/habana/transformers/models/swin/modeling_swin.py +++ /dev/null @@ -1,52 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch Swin Transformer model.""" - -import torch -from transformers.models.swin.modeling_swin import window_partition - - -def gaudi_swin_get_attn_mask(self, height, width, dtype): - """ - Copied from SwinLayer.get_attn_mask : https://github.com/huggingface/transformers/blob/main/src/transformers/models/swin/modeling_swin.py - The only difference is moving img_mask to hpu for performance - """ - if self.shift_size > 0: - # calculate attention mask for SW-MSA - img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device="hpu") - height_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - width_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - count = 0 - for height_slice in height_slices: - for width_slice in width_slices: - img_mask[:, height_slice, width_slice, :] = count - count += 1 - - mask_windows = window_partition(img_mask, self.window_size) - mask_windows = mask_windows.view(-1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) - else: - attn_mask = None - - return attn_mask diff --git a/optimum/habana/transformers/models/t5/__init__.py b/optimum/habana/transformers/models/t5/__init__.py deleted file mode 100644 index e92116128d..0000000000 --- a/optimum/habana/transformers/models/t5/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .modeling_t5 import ( - gaudi_t5_layernorm_forward, - gaudi_T5Attention_forward, - gaudi_T5Block_forward, - gaudi_T5ForConditionalGeneration_forward, - gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation, - gaudi_T5LayerSelfAttention_forward, - gaudi_T5Stack_forward, -) diff --git a/optimum/habana/transformers/models/t5/modeling_t5.py b/optimum/habana/transformers/models/t5/modeling_t5.py deleted file mode 100644 index 17b0e49a97..0000000000 --- a/optimum/habana/transformers/models/t5/modeling_t5.py +++ /dev/null @@ -1,636 +0,0 @@ -import warnings -from typing import Optional, Tuple, Union - -import habana_frameworks.torch.core as htcore -import torch -import torch.nn as nn -from torch.nn import CrossEntropyLoss -from transformers.modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, -) -from transformers.models.t5.modeling_t5 import __HEAD_MASK_WARNING_MSG -from transformers.utils import ( - logging, -) - - -logger = logging.get_logger(__name__) - -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm -except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None - - -def gaudi_t5_layernorm_forward(self, hidden_states): - """ - Copied from T5LayerNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py - The only differences are: - - override RMSNorm with Habana fused RMSNorm - """ - if hidden_states.device.type == "hpu" and FusedRMSNorm: - orig_dtype = hidden_states.dtype - hidden_states = FusedRMSNorm.apply(hidden_states.float(), self.weight.float(), self.variance_epsilon) - return hidden_states.to(orig_dtype) - else: - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in 
[torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -def gaudi_T5Attention_forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - token_idx=None, -): - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key_value is not None: - if len(past_key_value) != 2: - raise ValueError( - f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - ) - if token_idx is None: - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - else: - real_seq_length = past_key_value[0].shape[2] - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - if token_idx is None: - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - else: - hidden_states = past_key_value.index_copy_(-2, token_idx - 1, hidden_states) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) - - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - if self.gradient_checkpointing and self.training: - position_bias.requires_grad = True - else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) - - # if key and values are already 
calculated - # we want only the last query position bias - if past_key_value is not None: - if token_idx is None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - else: - position_bias = position_bias.index_select(-2, token_idx - 1) - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.pruned_heads: - mask = torch.ones(position_bias.shape[1]) - mask[list(self.pruned_heads)] = 0 - position_bias_masked = position_bias[:, mask.bool()] - else: - position_bias_masked = position_bias - - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - if self.training: - htcore.mark_step() - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - if self.training: - htcore.mark_step() - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -def gaudi_T5LayerSelfAttention_forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - token_idx=None, -): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -def gaudi_T5Block_forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - token_idx=None, -): - if past_key_value is not None: - if not self.is_decoder: - logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. 
' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -def gaudi_T5Stack_forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - token_idx=None, -): - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError( - f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError(f"You have to specify 
either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") - - if inputs_embeds is None: - if self.embed_tokens is None: - raise ValueError("You have to initialize the model with valid token embeddings") - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - - # required mask seq length can be calculated via length of past - if token_idx is not None: - mask_seq_length = past_key_values[0][0].shape[2] if past_key_values is not None else seq_length - else: - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - - if use_cache is True: - if not self.is_decoder: - raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") - - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) - - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer_module.forward, - hidden_states, - extended_attention_mask, - position_bias, - encoder_hidden_states, - encoder_extended_attention_mask, - encoder_decoder_position_bias, - layer_head_mask, - cross_attn_layer_head_mask, - None, # past_key_value is always None with gradient checkpointing - use_cache, - output_attentions, - True, - None, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - - hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + (present_key_value_state,) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=present_key_value_states, - hidden_states=all_hidden_states, - attentions=all_attentions, - cross_attentions=all_cross_attentions, - ) - - -def 
gaudi_T5ForConditionalGeneration_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - token_idx: Optional[torch.LongTensor] = None, -) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - # Convert encoder inputs in embeddings if needed - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - token_idx=token_idx, - ) - - sequence_output = decoder_outputs[0] - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim**-0.5) - - lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - # move labels to correct device to enable PP - labels = labels.to(lm_logits.device) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss 
https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output - - return Seq2SeqLMOutput( - loss=loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -def gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - token_idx=None, - **kwargs, -): - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - if token_idx is not None: - input_ids = torch.index_select(input_ids, 1, token_idx - 1) - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - "token_idx": token_idx, - } diff --git a/optimum/habana/transformers/models/vision_encoder_decoder/__init__.py b/optimum/habana/transformers/models/vision_encoder_decoder/__init__.py deleted file mode 100644 index 6ad544727a..0000000000 --- a/optimum/habana/transformers/models/vision_encoder_decoder/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .modeling_vision_encoder_decoder import ( - gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation, -) diff --git a/optimum/habana/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/optimum/habana/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py deleted file mode 100644 index 9c2c9dca2a..0000000000 --- a/optimum/habana/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ /dev/null @@ -1,32 +0,0 @@ -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -def gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs -): - """ - Copied from VideoEncoderDecoderModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py#L645 - The only differences are: - - add token idx support - """ - decoder_inputs = self.decoder.prepare_inputs_for_generation( - input_ids, - past_key_values=past_key_values, - attention_mask=kwargs.get("decoder_attention_mask", None), - token_idx=kwargs.get("token_idx", None), - ) - decoder_attention_mask = 
decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs["past_key_values"], - "use_cache": use_cache, - "decoder_position_ids": decoder_inputs.get("position_ids", None), - "decoder_token_idx": decoder_inputs.get("token_idx", None), - } - return input_dict diff --git a/optimum/habana/transformers/models/vit/__init__.py b/optimum/habana/transformers/models/vit/__init__.py deleted file mode 100644 index 0d3835de54..0000000000 --- a/optimum/habana/transformers/models/vit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modeling_vit import gaudi_vit_self_attention_forward diff --git a/optimum/habana/transformers/models/vit/modeling_vit.py b/optimum/habana/transformers/models/vit/modeling_vit.py deleted file mode 100644 index 4fd5990e14..0000000000 --- a/optimum/habana/transformers/models/vit/modeling_vit.py +++ /dev/null @@ -1,61 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Optional, Tuple, Union - -import torch - - -def gaudi_vit_self_attention_forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False -) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - """ - Same method as transformers.models.vit.modeling_vit.ViTSelfAttention.forward with a small tweak: - the division is performed before the matmul for computing attention scores. - This gives better performance on HPU. - """ - - mixed_query_layer = self.query(hidden_states) - - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - # The div has been put inside the matmul because it achieves better performance on HPU. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(self.attention_head_size)) - - # Normalize the attention scores to probabilities. - attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs diff --git a/optimum/habana/transformers/models/vits/__init__.py b/optimum/habana/transformers/models/vits/__init__.py deleted file mode 100644 index b0cf4ecfeb..0000000000 --- a/optimum/habana/transformers/models/vits/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .modeling_vits import ( - gaudi_unconstrained_rational_quadratic_spline, - gaudi_VitsResidualCouplingLayer_forward, -) diff --git a/optimum/habana/transformers/models/vits/modeling_vits.py b/optimum/habana/transformers/models/vits/modeling_vits.py deleted file mode 100644 index 174e8a3ac9..0000000000 --- a/optimum/habana/transformers/models/vits/modeling_vits.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -import torch -from torch import nn -from transformers.models.vits.modeling_vits import _rational_quadratic_spline -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -def gaudi_unconstrained_rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - reverse=False, - tail_bound=5.0, - min_bin_width=1e-3, - min_bin_height=1e-3, - min_derivative=1e-3, -): - """ - Copied from _unconstrained_rational_quadratic_spline: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/vits/modeling_vits.py#L126 - The only differences are: - - WA to fix hpu graph accuracy issue - """ - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - log_abs_det = torch.zeros_like(inputs) - constant = np.log(np.exp(1 - min_derivative) - 1) - - unnormalized_derivatives = nn.functional.pad(unnormalized_derivatives, pad=(1, 1)) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - log_abs_det[outside_interval_mask] = 0.0 - - outputs_i, log_abs_det_i = _rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - reverse=reverse, - tail_bound=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - ) - outputs = outputs_i * inside_interval_mask + outputs * outside_interval_mask - log_abs_det = log_abs_det_i * inside_interval_mask + log_abs_det * outside_interval_mask - return outputs, log_abs_det - - -def gaudi_VitsResidualCouplingLayer_forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): - """ - Copied from VitsResidualCouplingLayer:forward: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/vits/modeling_vits.py - The only differences are: - - WA to fix torch.flip issue after conv1d - """ - first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1) - 
hidden_states = self.conv_pre(first_half) * padding_mask - hidden_states = self.wavenet(hidden_states, padding_mask, global_conditioning) - mean = self.conv_post(hidden_states) * padding_mask - log_stddev = torch.zeros_like(mean) - - if not reverse: - second_half = mean.cpu() + second_half * torch.exp(log_stddev) * padding_mask - outputs = torch.cat([first_half, second_half], dim=1) - log_determinant = torch.sum(log_stddev, [1, 2]) - return outputs, log_determinant - else: - second_half = (second_half - mean.cpu()) * torch.exp(-log_stddev) * padding_mask - outputs = torch.cat([first_half, second_half], dim=1) - return outputs, None diff --git a/optimum/habana/transformers/models/wav2vec2/__init__.py b/optimum/habana/transformers/models/wav2vec2/__init__.py deleted file mode 100644 index 84372061b6..0000000000 --- a/optimum/habana/transformers/models/wav2vec2/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .modeling_wav2vec2 import ( - _gaudi_wav2vec2_compute_mask_indices, - _gaudi_wav2vec2_mask_hidden_states, - _gaudi_wav2vec2_sample_negative_indices, - gaudi_wav2vec2_encoder_forward, - gaudi_wav2vec2_forward, - gaudi_wav2vec2_tdnnlayer_forward, - gaudi_wav2vec2forctc_forward, -) diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py deleted file mode 100644 index c6dd9cb546..0000000000 --- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py +++ /dev/null @@ -1,465 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -import torch -from habana_frameworks.torch.hpu import get_device_name -from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.modeling_outputs import ( - BaseModelOutput, - CausalLMOutput, - Wav2Vec2BaseModelOutput, -) -from transformers.models.wav2vec2.modeling_wav2vec2 import _HIDDEN_STATES_START_POSITION - - -try: - from habana_frameworks.torch.hpex.kernels import CTCLoss - - custom_ctc_loss_fwd = CTCLoss.apply -except ImportError: - print("Could not import Custom CTCLoss kernel. This Kernel is available only for SynapseAI >= 1.15.0") - custom_ctc_loss_fwd = None - - -def _gaudi_wav2vec2_compute_mask_indices( - shape: Tuple[int, int], - mask_prob: float, - mask_length: int, - attention_mask: Optional[torch.LongTensor] = None, - min_masks: int = 0, -) -> torch.Tensor: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L135 - The only differences are (1) that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers), (2) epsilon is generated on HPU instead of CPU, (3) check - to ensure indices are not larger than sequence length is re-written to avoid host sync. 
- """ - batch_size, sequence_length = shape - - if mask_length < 1: - raise ValueError("`mask_length` has to be bigger than 0.") - - if mask_length > sequence_length: - raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" - f" and `sequence_length`: {sequence_length}`" - ) - - # epsilon is used for probabilistic rounding - epsilon = torch.rand([], device="hpu") - - def compute_num_masked_span(input_length): - """Given input length, compute how many spans should be masked""" - num_masked_span = int(mask_prob * input_length / mask_length + epsilon) - num_masked_span = max(num_masked_span, min_masks) - - # make sure num masked span <= sequence_length - if num_masked_span * mask_length > sequence_length: - num_masked_span = sequence_length // mask_length - - # make sure num_masked span is also <= input_length - (mask_length - 1) - if input_length - (mask_length - 1) < num_masked_span: - num_masked_span = max(input_length - (mask_length - 1), 0) - - return num_masked_span - - # compute number of masked spans in batch - input_lengths = ( - attention_mask.sum(-1).detach().tolist() - if attention_mask is not None - else [sequence_length for _ in range(batch_size)] - ) - - # SpecAugment mask to fill - spec_aug_mask = torch.zeros((batch_size, sequence_length), dtype=torch.bool, device="hpu") - spec_aug_mask_idxs = [] - - max_num_masked_span = compute_num_masked_span(sequence_length) - - if max_num_masked_span == 0: - return spec_aug_mask - - for input_length in input_lengths: - # compute num of masked spans for this input - num_masked_span = compute_num_masked_span(input_length) - - # get random indices to mask - spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span] - - # pick first sampled index that will serve as a dummy index to pad vector - # to ensure same dimension for all batches due to probabilistic rounding - # Picking first sample just pads those vectors twice. 
- if len(spec_aug_mask_idx) == 0: - # this case can only happen if `input_length` is strictly smaller then - # `sequence_length` in which case the last token has to be a padding - # token which we can use as a dummy mask id - dummy_mask_idx = sequence_length - 1 - else: - dummy_mask_idx = spec_aug_mask_idx[0] - - spec_aug_mask_idx = torch.cat( - [ - spec_aug_mask_idx, - torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32, device="hpu") * dummy_mask_idx, - ] - ) - spec_aug_mask_idxs.append(spec_aug_mask_idx.to(dtype=torch.long)) - - spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs) - - # expand masked indices to masked spans - spec_aug_mask_idxs = torch.broadcast_to( - spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) - ) - spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) - - # add offset to the starting indexes so that indexes now create a span - offsets = torch.arange(mask_length, device="hpu")[None, None, :] - offsets = torch.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( - batch_size, max_num_masked_span * mask_length - ) - spec_aug_mask_idxs = spec_aug_mask_idxs + offsets - - # ensure that we cannot have indices larger than sequence_length - if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None: - if spec_aug_mask_idxs.max() > sequence_length - 1: - spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 - else: - mask = (spec_aug_mask_idxs > sequence_length - 1) * (spec_aug_mask_idxs.max() > sequence_length - 1) - inverse_mask = torch.bitwise_not(mask) - spec_aug_mask_idxs = spec_aug_mask_idxs * inverse_mask + (sequence_length - 1) * mask - - # scatter indices to mask - spec_aug_mask.scatter_(-1, spec_aug_mask_idxs, 1) - - return spec_aug_mask - - -def _gaudi_wav2vec2_sample_negative_indices( - features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[torch.Tensor] = None -): - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L254 - The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). 
- """ - batch_size, sequence_length = features_shape - - # generate indices of the positive vectors themselves, repeat them `num_negatives` times - sequence_length_range = torch.arange(sequence_length, device="hpu") - - # get `num_negatives` random vector indices from the same utterance - sampled_negative_indices = torch.zeros( - shape=(batch_size, sequence_length, num_negatives), dtype=torch.int32, device="hpu" - ) - - mask_time_indices = ( - mask_time_indices.bool() - if mask_time_indices is not None - else torch.ones(features_shape, dtype=torch.bool, device="hpu") - ) - - for batch_idx in range(batch_size): - high = mask_time_indices[batch_idx].sum() - 1 - mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] - - feature_indices = torch.broadcast_to(torch.arange(high + 1, device="hpu")[:, None], (high + 1, num_negatives)) - sampled_indices = torch.randint(0, high, size=(high + 1, num_negatives), dtype=torch.int16, device="hpu") - # avoid sampling the same positive vector, but keep the distribution uniform - sampled_indices[sampled_indices >= feature_indices] += 1 - - # remap to actual indices - sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] - - # correct for batch size - sampled_negative_indices[batch_idx] += batch_idx * sequence_length - - return sampled_negative_indices - - -def gaudi_wav2vec2_forward( - self, - input_values: Optional[torch.Tensor], - attention_mask: Optional[torch.Tensor] = None, - mask_time_indices: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[Tuple, Wav2Vec2BaseModelOutput]: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282 - The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - extract_features = self.feature_extractor(input_values) - extract_features = extract_features.transpose(1, 2) - - if attention_mask is not None: - # compute reduced attention_mask corresponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - hidden_states, extract_features = self.feature_projection(extract_features) - hidden_states = self._mask_hidden_states( - hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask - ) - - encoder_outputs = self.encoder( - hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = encoder_outputs[0] - - if self.adapter is not None: - hidden_states = self.adapter(hidden_states) - - if not return_dict: - return (hidden_states, extract_features) + encoder_outputs[1:] - - return Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -def _gaudi_wav2vec2_mask_hidden_states( - self, - hidden_states: torch.FloatTensor, - mask_time_indices: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, -): - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1227 - Differences are that (1) `mask_time_indices` is not moved to the current device and converted into boolean because this is already done in _compute_mask_indices. - (2) index_put operation on hidden_states is replaced by combination of simpler ops (more suitable for HPU graphs) - """ - - # `config.apply_spec_augment` can set masking to False - if not getattr(self.config, "apply_spec_augment", True): - return hidden_states - - # generate indices & apply SpecAugment along time axis - batch_size, sequence_length, hidden_size = hidden_states.size() - - if mask_time_indices is not None: - # apply SpecAugment along time axis with given mask_time_indices - hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) - elif self.config.mask_time_prob > 0 and self.training: - mask_time_indices = _gaudi_wav2vec2_compute_mask_indices( - (batch_size, sequence_length), - mask_prob=self.config.mask_time_prob, - mask_length=self.config.mask_time_length, - attention_mask=attention_mask, - min_masks=self.config.mask_time_min_masks, - ) - # replacement of index_put with combination of simpler ops. Assumption made about sizes of hidden_states (3d), - # mask_time_indices (2d), self.masked_spec_embed (1d), for any other combination better to go back to original code using index_put. 
- # hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) - inverse_mask_time_indices = torch.bitwise_not(mask_time_indices) - hidden_states = hidden_states * inverse_mask_time_indices.unsqueeze(2) + self.masked_spec_embed.to( - hidden_states.dtype - ).expand(hidden_states.size()) * mask_time_indices.unsqueeze(2) - - if self.config.mask_feature_prob > 0 and self.training: - # generate indices & apply SpecAugment along feature axis - mask_feature_indices = _gaudi_wav2vec2_compute_mask_indices( - (batch_size, hidden_size), - mask_prob=self.config.mask_feature_prob, - mask_length=self.config.mask_feature_length, - min_masks=self.config.mask_feature_min_masks, - ) - mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) - hidden_states[mask_feature_indices] = 0 - - return hidden_states - - -def gaudi_wav2vec2_encoder_forward( - self, - hidden_states: torch.tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, -): - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/7790943c91411f4234d11dfbf4c2f21ce7caf088/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L755 - The only difference is that torch.rand device is set to 'hpu' (required to capture operation as part of HPU graph) - """ - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - - if attention_mask is not None: - # make sure padded tokens output 0 - expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) - hidden_states[~expand_attention_mask] = 0 - - # extend attention_mask - attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) - attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) - - position_embeddings = self.pos_conv_embed(hidden_states) - hidden_states = hidden_states + position_embeddings - hidden_states = self.layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) - - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() - - for layer in self.layers: - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([], device="hpu") - - skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False - if not skip_the_layer or deepspeed_zero3_is_enabled: - # under deepspeed zero3 all gpus must run in sync - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = layer_outputs[0] - - if skip_the_layer: - layer_outputs = (None, None) - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is 
not None) - return BaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2290 - v4.38.2 implementation caused accuracy issue to run pytest Wav2Vec2RobustModelTest. - """ - hidden_states = hidden_states.unsqueeze(1) - hidden_states = torch.nn.functional.unfold( - hidden_states, - (self.kernel_size, self.in_conv_dim), - stride=(1, self.in_conv_dim), - dilation=(self.dilation, 1), - ) - hidden_states = hidden_states.transpose(1, 2) - hidden_states = self.kernel(hidden_states) - - hidden_states = self.activation(hidden_states) - return hidden_states - - -def gaudi_wav2vec2forctc_forward( - self, - input_values: Optional[torch.Tensor], - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - labels: Optional[torch.Tensor] = None, -) -> Union[Tuple, CausalLMOutput]: - """ - copied from Transformers https://github.com/huggingface/transformers/blob/e770f0316d2a9b787c9d1440f204fcb65e176682/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1950 - only differences are (1) attention_mask tensor generation using ones_like is done on HPU, (2) masked_select is not applied on labels to compute flattened_targets to avoid - changing flattened_targets tensor shapes across training iterations. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.wav2vec2( - input_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] - hidden_states = self.dropout(hidden_states) - logits = self.lm_head(hidden_states) - loss = None - if labels is not None: - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - # retrieve loss input_lengths from attention_mask - attention_mask = ( - attention_mask - if attention_mask is not None - else torch.ones_like(input_values, dtype=torch.long, device="hpu") - ) - input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) - # assuming that padded tokens are filled with -100 - # when not being attended to - labels_mask = labels >= 0 - target_lengths = labels_mask.sum(-1) - # ctc_loss doesn't support fp16 - log_probs = torch.nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) - if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None: - flattened_targets = labels.masked_select(labels_mask) - loss = torch.nn.functional.ctc_loss( - log_probs, - flattened_targets, - input_lengths, - target_lengths, - blank=self.config.pad_token_id, - reduction=self.config.ctc_loss_reduction, - zero_infinity=self.config.ctc_zero_infinity, - ) - else: - flattened_targets = labels - loss = custom_ctc_loss_fwd( - log_probs, - flattened_targets, - input_lengths, - target_lengths, - self.config.pad_token_id, - self.config.ctc_loss_reduction, - self.config.ctc_zero_infinity, - ) - - if not return_dict: - output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] - return ((loss,) + output) if loss is not None else output - return CausalLMOutput(loss=loss, 
logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py deleted file mode 100644 index b36a637b98..0000000000 --- a/optimum/habana/transformers/trainer.py +++ /dev/null @@ -1,2388 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import copy -import importlib.metadata -import inspect -import math -import os -import random -import shutil -import sys -import time -import warnings -from collections.abc import Mapping -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - -import huggingface_hub.utils as hf_hub_utils -import numpy as np -import torch -from accelerate import skip_first_batches -from accelerate.data_loader import SeedableRandomSampler -from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin, save_fsdp_model -from huggingface_hub import upload_folder -from packaging import version -from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler -from transformers import Trainer -from transformers.data.data_collator import DataCollator -from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.integrations import hp_params -from transformers.integrations.deepspeed import ( - deepspeed_load_checkpoint, - is_deepspeed_available, - is_deepspeed_zero3_enabled, -) -from transformers.modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.trainer import _get_fsdp_ckpt_kwargs -from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import ( - DistributedTensorGatherer, - EvalLoopContainer, - IterableDatasetShard, - LengthGroupedSampler, - SequentialDistributedSampler, - find_batch_size, - get_dataloader_sampler, - get_model_param_count, - nested_concat, - nested_detach, - reissue_pt_warnings, - remove_dummy_checkpoint, -) -from transformers.trainer_utils import ( - PREFIX_CHECKPOINT_DIR, - EvalLoopOutput, - EvalPrediction, - HPSearchBackend, - HubStrategy, - IntervalStrategy, - TrainOutput, - denumpify_detensorize, - enable_full_determinism, - find_executable_batch_size, - get_last_checkpoint, - has_length, -) -from transformers.training_args import ParallelMode, TrainingArguments -from transformers.utils import ( - ADAPTER_SAFE_WEIGHTS_NAME, - ADAPTER_WEIGHTS_NAME, - CONFIG_NAME, - SAFE_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, - WEIGHTS_NAME, - PushInProgress, - is_accelerate_available, - is_datasets_available, - is_peft_available, - is_safetensors_available, -) - -from optimum.utils import logging - -from ..accelerate import GaudiAccelerator -from ..accelerate.utils import FP8ContextWrapper, GaudiDistributedType -from ..utils import ( - HabanaProfile, - 
get_hpu_memory_stats, - set_seed, - speed_metrics, - to_device_dtype, -) -from .gaudi_configuration import GAUDI_CONFIG_NAME, GaudiConfig -from .integrations.deepspeed import deepspeed_init -from .trainer_utils import convert_into_dtypes, get_dtype -from .training_args import GaudiTrainingArguments - - -if is_datasets_available(): - import datasets - - -if is_safetensors_available(): - import safetensors.torch - - -if is_peft_available(): - from peft import PeftModel - from peft.utils import PeftType - - -if is_deepspeed_available(): - from accelerate.utils import DeepSpeedSchedulerWrapper - -if is_accelerate_available(): - from accelerate.utils import ( - load_fsdp_optimizer, - save_fsdp_optimizer, - ) - -if TYPE_CHECKING: - import optuna - - -DATA_SAMPLERS = [RandomSampler, SeedableRandomSampler] - - -if is_accelerate_available("0.28.0"): - from accelerate.utils import DataLoaderConfiguration - - -def _is_peft_model(model): - if is_peft_available(): - classes_to_check = (PeftModel,) if is_peft_available() else () - # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321 - if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"): - from peft import PeftMixedModel - - classes_to_check = (*classes_to_check, PeftMixedModel) - return isinstance(model, classes_to_check) - return False - - -if TYPE_CHECKING: - if is_datasets_available(): - import datasets - - -logger = logging.get_logger(__name__) - - -# Name of the files used for checkpointing -TRAINING_ARGS_NAME = "training_args.bin" -TRAINER_STATE_NAME = "trainer_state.json" -OPTIMIZER_NAME = "optimizer.pt" -OPTIMIZER_NAME_BIN = "optimizer.bin" -SCHEDULER_NAME = "scheduler.pt" -SCALER_NAME = "scaler.pt" - - -class GaudiTrainer(Trainer): - """ - GaudiTrainer is built on top of the tranformers' Trainer to enable - deployment on Habana's Gaudi. - """ - - def __init__( - self, - model: Union[PreTrainedModel, torch.nn.Module] = None, - gaudi_config: GaudiConfig = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Optional[Callable[[], PreTrainedModel]] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - ): - if args is None: - output_dir = "tmp_trainer" - logger.info(f"No `GaudiTrainingArguments` passed, using `output_dir={output_dir}`.") - args = GaudiTrainingArguments(output_dir=output_dir) - - self.use_hpu_amp = False - self.use_cpu_amp = False - if args.bf16 and not args.deepspeed: - if args.half_precision_backend == "hpu_amp": - self.use_hpu_amp = True - else: - self.use_cpu_amp = True - - # Workaround to not set amp backend again when calling super().__init__(...) 
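Given the constructor signature above, a typical instantiation looks roughly like the following. This is a usage sketch rather than code from this diff: it assumes `optimum-habana` is installed and a Gaudi device is available, `my_tokenized_dataset` is a placeholder, and `Habana/bert-base-uncased` is only an example Gaudi configuration name.

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.habana import GaudiTrainer, GaudiTrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

args = GaudiTrainingArguments(
    output_dir="./out",
    use_habana=True,                               # run on HPU
    use_lazy_mode=True,                            # lazy (graph) execution
    bf16=True,                                     # resolved to the hpu_amp/cpu_amp flags set in __init__ above
    per_device_train_batch_size=8,
    gaudi_config_name="Habana/bert-base-uncased",  # example name; a GaudiConfig can also be passed directly
)

trainer = GaudiTrainer(
    model=model,
    args=args,
    train_dataset=my_tokenized_dataset,            # placeholder dataset
    tokenizer=tokenizer,
)
trainer.train()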
- # args.bf16 is not used after the __init__ anyway - args.bf16 = False - - super().__init__( - model, - args, - data_collator, - train_dataset, - eval_dataset, - tokenizer, - model_init, - compute_metrics, - callbacks, - optimizers, - preprocess_logits_for_metrics, - ) - - if gaudi_config is None: - self.gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name) - else: - self.gaudi_config = copy.deepcopy(gaudi_config) - - if self.args.use_habana: - if self.args.use_hpu_graphs_for_inference: - self.already_wrapped_for_hpu_graphs = False - - if self.args.deepspeed: - # Mixed-precision backends are turned off when using DeepSpeed since it manages this itself - self.gaudi_config.use_torch_autocast = False - self.use_hpu_amp = False - - if self.args.deepspeed or self.args.dataloader_num_workers >= 1: - # To avoid warnings about parallelism in tokenizers - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - if self.gaudi_config.use_torch_autocast: - if not self.use_hpu_amp and not self.use_cpu_amp: - self.use_hpu_amp = True - logger.warning( - "The argument `--bf16` was not given but `use_torch_autocast` is True in the Gaudi configuration so mixed-precision training with Torch Autocast is enabled." - ) - - if self.use_hpu_amp and "LOWER_LIST" not in os.environ: - self.gaudi_config.declare_autocast_bf16_fp32_ops() - - if self.args.use_lazy_mode: - try: - import habana_frameworks.torch.core as htcore - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch.core. {error.msg}." - raise error - self.htcore = htcore - - try: - import habana_frameworks.torch.hpu as hthpu - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch.hpu. {error.msg}." - raise error - if self.gaudi_config.use_dynamic_shapes: - hthpu.enable_dynamic_shape() - else: - hthpu.disable_dynamic_shape() - - try: - from habana_frameworks.torch.hpu import random as hpu_random - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch.hpu.random. {error.msg}." - raise error - self.hpu_random = hpu_random - - # Set the correct log level depending on the node - # Already done in super().init() but we have to do it again - # because we use optimum.utils.logging here and not - # transformers.utils.logging - log_level = args.get_process_log_level() - logging.set_verbosity(log_level) - logging.enable_default_handler() - logging.enable_explicit_format() - - # Suppress PyTorch autocast warnings with Wav2Vec2 - # This is a bug in PyTorch - warnings.filterwarnings( - "ignore", message="User provided device_type of 'cuda', but CUDA is not available. Disabling" - ) - - def _move_model_to_device(self, model, device): - model = model.to(device) - # Moving a model to HPU disconnects the tied weights, so we have to retie them. - if self.args.use_habana and hasattr(model, "tie_weights"): - model.tie_weights() - - def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: - if self.train_dataset is None or not has_length(self.train_dataset): - return None - - # Build the sampler. 
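`_move_model_to_device` above calls `tie_weights()` again after `.to(device)` because moving a model to the HPU can untie shared parameters. The retying pattern itself is plain PyTorch; a CPU-only toy illustration (the `TiedToyLM` class is invented for this sketch and stands in for a real Transformers model):

import torch
import torch.nn as nn

class TiedToyLM(nn.Module):
    # Toy model whose output projection shares its weight with the embedding.
    def __init__(self, vocab=10, dim=4):
        super().__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.lm_head = nn.Linear(dim, vocab, bias=False)
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.embed.weight

def move_model_to_device(model, device):
    # Mirrors the hook above: move first, then retie, in case .to() broke the sharing.
    model = model.to(device)
    if hasattr(model, "tie_weights"):
        model.tie_weights()
    return model

model = move_model_to_device(TiedToyLM(), torch.device("cpu"))
assert model.lm_head.weight is model.embed.weight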
- if self.args.group_by_length: - if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset): - lengths = ( - self.train_dataset[self.args.length_column_name] - if self.args.length_column_name in self.train_dataset.column_names - else None - ) - else: - lengths = None - model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None - return LengthGroupedSampler( - self.args.train_batch_size * self.args.gradient_accumulation_steps, - dataset=self.train_dataset, - lengths=lengths, - model_input_name=model_input_name, - ) - - else: - num_samples = len(self.train_dataset) - if ( - not self.args.dataloader_drop_last - and len(self.train_dataset) % self.args.per_device_train_batch_size != 0 - and self.args.parallel_mode != ParallelMode.DISTRIBUTED - ): - # Make the total number of samples divisible by the batch size in lazy mode if needed - num_samples += ( - self.args.per_device_train_batch_size - - len(self.train_dataset) % self.args.per_device_train_batch_size - ) - return RandomSampler(self.train_dataset, num_samples=num_samples) - - def create_optimizer(self): - """ - Setup the optimizer. - - We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the - Trainer's init through `optimizers`, or subclass and override this method in a subclass. - """ - if self.optimizer is None: - decay_parameters = self.get_decay_parameter_names(self.model) - - optimizer_grouped_parameters = [] - for t_params, t_weight_decay in zip( - [ - [p for n, p in self.model.named_parameters() if n in decay_parameters and p.requires_grad], - [p for n, p in self.model.named_parameters() if n not in decay_parameters and p.requires_grad], - ], - [self.args.weight_decay, 0.0], - ): - # Empty groups of parameters are filtered because they make FusedAdamW crash - if t_params: - optimizer_grouped_parameters.append( - { - "params": t_params, - "weight_decay": t_weight_decay, - } - ) - - if self.gaudi_config.use_fused_adam and self.args.use_habana: - try: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - except ImportError as error: - error.msg = ( - f"Could not import 'FusedAdamW' from 'habana_frameworks.torch.hpex.optimizers'. {error.msg}." - ) - raise error - optimizer_cls = FusedAdamW - optimizer_kwargs = { - "lr": self.args.learning_rate, - "betas": (self.args.adam_beta1, self.args.adam_beta2), - "eps": self.args.adam_epsilon, - } - else: - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args, self.model) - - # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs` - # e.g. for GaLore optimizer. - if "params" in optimizer_kwargs: - optimizer_grouped_parameters = optimizer_kwargs.pop("params") - - # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict` - # to avoid arguments conflicts. 
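`create_optimizer` above splits parameters into a weight-decay group and a no-decay group and drops any group that ends up empty, because empty parameter groups make `FusedAdamW` crash. The grouping itself is optimizer-agnostic; a self-contained sketch using `torch.optim.AdamW` in place of the Habana fused optimizer (the decay rule below is a simplified heuristic, not the Trainer's exact `get_decay_parameter_names`):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Linear(8, 2))

# Simplified decay rule: decay matrices, skip biases and LayerNorm parameters.
decay_names = {n for n, p in model.named_parameters() if p.ndim > 1}

groups = []
for params, weight_decay in (
    ([p for n, p in model.named_parameters() if n in decay_names and p.requires_grad], 0.01),
    ([p for n, p in model.named_parameters() if n not in decay_names and p.requires_grad], 0.0),
):
    if params:  # skip empty groups, as the trainer above does for FusedAdamW
        groups.append({"params": params, "weight_decay": weight_decay})

optimizer = torch.optim.AdamW(groups, lr=3e-4, betas=(0.9, 0.999), eps=1e-8)
print([len(g["params"]) for g in optimizer.param_groups])  # [2, 4]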
- if "optimizer_dict" in optimizer_kwargs: - optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict") - - self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - - return self.optimizer - - def _tune_save_checkpoint(self, checkpoint_dir: str): - output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}") - self.save_model(output_dir, _internal_call=True) - if self.args.should_save: - self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - - def _wrap_model(self, model, training=True, dataloader=None): - # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again - if unwrap_model(model) is not model: - return model - - # Note: in torch.distributed mode, there's no point in wrapping the model - # inside a DistributedDataParallel as we'll be under `no_grad` anyways. - if not training: - return model - - if self.args.parallel_mode == ParallelMode.DISTRIBUTED and self.args.distribution_strategy == "ddp": - kwargs = {} - - kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters - if self.args.ddp_find_unused_parameters and self.args.gradient_checkpointing: - logger.warning( - "ddp_find_unused_parameters and gradient_checkpointing are both True, which may lead to an error:" - " https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021" - ) - kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb - - if self.args.use_habana: - kwargs["gradient_as_bucket_view"] = True - - if self.args.ddp_broadcast_buffers is not None: - kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers - - self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) - - if self.args.use_hpu_graphs_for_training: - import habana_frameworks.torch as ht - - ht.hpu.ModuleCacher()(model=model, inplace=True) - - return model - - def train( - self, - resume_from_checkpoint: Optional[Union[str, bool]] = None, - trial: Union["optuna.Trial", Dict[str, Any]] = None, - ignore_keys_for_eval: Optional[List[str]] = None, - **kwargs, - ): - """ - Main training entry point. - Args: - resume_from_checkpoint (`str` or `bool`, *optional*): - If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a - `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance - of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): - The trial run or the hyperparameter dictionary for hyperparameter search. - ignore_keys_for_eval (`List[str]`, *optional*) - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions for evaluation during the training. 
- kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments used to hide deprecated arguments - """ - if resume_from_checkpoint is False: - resume_from_checkpoint = None - - # memory metrics - must set up as early as possible - self._memory_tracker.start() - - args = self.args - - self.is_in_train = True - - # Attach NEFTune hooks if necessary - if self.neftune_noise_alpha is not None: - self.model = self._activate_neftune(self.model) - - # do_train is not a reliable argument, as it might not be set and .train() still called, so - # the following is a workaround: - if (args.fp16_full_eval or args.bf16_full_eval) and not args.do_train: - self._move_model_to_device(self.model, args.device) - - if "model_path" in kwargs: - resume_from_checkpoint = kwargs.pop("model_path") - warnings.warn( - ( - "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` " - "instead." - ), - FutureWarning, - ) - if len(kwargs) > 0: - raise TypeError(f"train() received got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.") - # This might change the seed so needs to run first. - self._hp_search_setup(trial) - self._train_batch_size = self.args.train_batch_size - - # Model re-init - model_reloaded = False - if self.model_init is not None: - # Seed must be set before instantiating the model when using model_init. - if self.args.full_determinism: - enable_full_determinism(self.args.seed) - else: - set_seed(self.args.seed) - self.model = self.call_model_init(trial) - model_reloaded = True - # Reinitializes optimizer and scheduler - self.optimizer, self.lr_scheduler = None, None - - # Load potential model checkpoint - if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: - resume_from_checkpoint = get_last_checkpoint(args.output_dir) - if resume_from_checkpoint is None: - raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") - - if resume_from_checkpoint is not None: - if not self.is_deepspeed_enabled and not self.is_fsdp_enabled: - self._load_from_checkpoint(resume_from_checkpoint) - # In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly - state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - if state.train_batch_size is not None: - self._train_batch_size = state.train_batch_size - - # If model was re-initialized, put it on the right device and update self.model_wrapped - if model_reloaded: - if self.place_model_on_device: - self._move_model_to_device(self.model, args.device) - self.model_wrapped = self.model - - inner_training_loop = find_executable_batch_size( - self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size - ) - - if args.push_to_hub: - try: - # Disable progress bars when uploading models during checkpoints to avoid polluting stdout - hf_hub_utils.disable_progress_bars() - return inner_training_loop( - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) - finally: - hf_hub_utils.enable_progress_bars() - else: - return inner_training_loop( - args=args, - resume_from_checkpoint=resume_from_checkpoint, - trial=trial, - ignore_keys_for_eval=ignore_keys_for_eval, - ) - - def _inner_training_loop( - self, - batch_size=None, - args=None, - resume_from_checkpoint=None, - trial=None, - ignore_keys_for_eval=None, - ): - self.accelerator.free_memory() - self._train_batch_size = batch_size - if self.args.auto_find_batch_size: - if 
self.state.train_batch_size != self._train_batch_size: - from accelerate.utils import release_memory - - (self.model_wrapped,) = release_memory(self.model_wrapped) - self.model_wrapped = self.model - - # Check for DeepSpeed *after* the intial pass and modify the config - if self.is_deepspeed_enabled: - # Temporarily unset `self.args.train_batch_size` - original_bs = self.args.per_device_train_batch_size - self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu) - self.propagate_args_to_deepspeed(True) - self.args.per_device_train_batch_size = original_bs - self.state.train_batch_size = self._train_batch_size - logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") - # Data loader and number of training steps - train_dataloader = self.get_train_dataloader() - - # Setting up training control variables: - # number of training epochs: num_train_epochs - # number of training steps per epoch: num_update_steps_per_epoch - # total number of training steps to execute: max_steps - total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size - - len_dataloader = None - num_train_tokens = None - if has_length(train_dataloader): - len_dataloader = len(train_dataloader) - num_update_steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps - num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) - num_examples = self.num_examples(train_dataloader) - if args.max_steps > 0: - max_steps = args.max_steps - num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - args.max_steps % num_update_steps_per_epoch > 0 - ) - # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's - # the best we can do. - num_train_samples = args.max_steps * total_train_batch_size - if args.include_tokens_per_second: - num_train_tokens = ( - self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps - ) - else: - max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) - num_train_epochs = math.ceil(args.num_train_epochs) - num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs - if args.include_tokens_per_second: - num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs - elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size - max_steps = args.max_steps - # Setting a very large number of epochs so we go as many times as necessary over the iterator. 
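The block above derives `max_steps`, the epoch count and the number of update steps per epoch from the dataloader length and the gradient-accumulation factor. The arithmetic is easy to misread inline, so here is the same bookkeeping as a standalone sketch covering only the `has_length(train_dataloader)` branch shown above:

import math

def training_schedule(num_batches_per_epoch, grad_accum_steps, num_train_epochs=3.0, max_steps=-1):
    # Returns (max_steps, num_train_epochs, updates_per_epoch) following the logic above.
    updates_per_epoch = max(num_batches_per_epoch // grad_accum_steps, 1)
    if max_steps > 0:
        epochs = max_steps // updates_per_epoch + int(max_steps % updates_per_epoch > 0)
        return max_steps, epochs, updates_per_epoch
    return math.ceil(num_train_epochs * updates_per_epoch), math.ceil(num_train_epochs), updates_per_epoch

# 1000 batches per epoch, accumulation over 4 batches, 3 epochs -> 250 updates/epoch, 750 steps
print(training_schedule(1000, 4))                  # (750, 3, 250)
# an explicit max_steps takes precedence over the epoch count
print(training_schedule(1000, 4, max_steps=600))   # (600, 3, 250)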
- num_train_epochs = sys.maxsize - num_update_steps_per_epoch = max_steps - num_examples = total_train_batch_size * args.max_steps - num_train_samples = args.max_steps * total_train_batch_size - if args.include_tokens_per_second: - num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps - else: - raise ValueError( - "args.max_steps must be set to a positive value if dataloader does not have a length, was" - f" {args.max_steps}" - ) - - if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: - debug_overflow = DebugUnderflowOverflow(self.model) # noqa - - delay_optimizer_creation = self.is_fsdp_enabled - - # We need to reset the scheduler, as its parameters may be different on subsequent calls - if self._created_lr_scheduler: - self.lr_scheduler = None - self._created_lr_scheduler = False - - if self.is_deepspeed_enabled: - self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) - - if not delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - self.state = TrainerState() - self.state.is_hyper_param_search = trial is not None - self.state.train_batch_size = self._train_batch_size - - # Compute absolute values for logging, eval, and save if given as ratio - if args.logging_steps is not None: - if args.logging_steps < 1: - self.state.logging_steps = math.ceil(max_steps * args.logging_steps) - else: - self.state.logging_steps = args.logging_steps - if args.eval_steps is not None: - if args.eval_steps < 1: - self.state.eval_steps = math.ceil(max_steps * args.eval_steps) - else: - self.state.eval_steps = args.eval_steps - if args.save_steps is not None: - if args.save_steps < 1: - self.state.save_steps = math.ceil(max_steps * args.save_steps) - else: - self.state.save_steps = args.save_steps - - # Activate gradient checkpointing if needed - if args.gradient_checkpointing: - if args.gradient_checkpointing_kwargs is None: - gradient_checkpointing_kwargs = {} - else: - gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs - - import transformers.modeling_utils - - if args.deepspeed: - from deepspeed.runtime.activation_checkpointing.checkpointing import CheckpointFunction - - # HACK because outputs should always be tuples - def hpu_deepspeed_checkpointing(function, *checkpoint_args): - """DeepSpeed acitvation checkpointing.""" - all_outputs = [] - CheckpointFunction.apply(function, all_outputs, *checkpoint_args) - # Always return a tuple - # When all_outputs contains only one element, DeepSpeed returns this element instead of a tuple - # which is not consistent with some models. See https://github.com/microsoft/DeepSpeed/issues/1057. - return tuple(all_outputs) - - torch.utils.checkpoint.checkpoint = hpu_deepspeed_checkpointing - transformers.modeling_utils.checkpoint = hpu_deepspeed_checkpointing - elif args.use_lazy_mode: - from .gradient_checkpointing import checkpoint as lazy_mode_checkpointing - - torch.utils.checkpoint.checkpoint = lazy_mode_checkpointing - transformers.modeling_utils.checkpoint = lazy_mode_checkpointing - - self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - - # Wrap `_gradient_checkpointing_func` in the model with `transformer_engine` `activation_checkpointing` context. 
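As shown above, `logging_steps`, `eval_steps` and `save_steps` can be given either as absolute step counts or as fractions of `max_steps`. The conversion rule is small enough to state on its own; values strictly below 1 are treated as ratios:

import math

def resolve_interval(value, max_steps):
    # None disables the interval; a value < 1 is a ratio of max_steps, otherwise an absolute count.
    if value is None:
        return None
    return math.ceil(max_steps * value) if value < 1 else int(value)

print(resolve_interval(0.1, 750))  # 75  -> every 10% of training
print(resolve_interval(500, 750))  # 500 -> absolute step count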
- if self.accelerator.state.is_fp8_enabled: - FP8ContextWrapper.gradient_checkpointing_wrap(self.model) - else: - # Hack because `RegressionModel` in test_trainer.py doesn't have `gradient_checkpointing_disable` - if hasattr(self.model, "gradient_checkpointing_disable"): - self.model.gradient_checkpointing_disable() - - model = self._wrap_model(self.model_wrapped) - - # as the model is wrapped, don't use `accelerator.prepare` - # this is for unhandled cases such as - # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX - use_accelerator_prepare = True if model is self.model else False - - if delay_optimizer_creation: - if use_accelerator_prepare: - self._fsdp_qlora_plugin_updates() - self.model = self.accelerator.prepare(self.model) - self.create_optimizer_and_scheduler(num_training_steps=max_steps) - - # prepare using `accelerator` prepare - if use_accelerator_prepare: - self.model.train() - if hasattr(self.lr_scheduler, "step"): - model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) - else: - # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. - model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( - self.model, self.optimizer, self.lr_scheduler - ) - - if self.is_fsdp_enabled: - self.model = self.model_wrapped = model - - # for the rest of this function `model` is the outside model, whether it was wrapped or not - if model is not self.model: - self.model_wrapped = model - - # backward compatibility - if self.is_deepspeed_enabled: - self.deepspeed = self.model_wrapped - - # ckpt loading - if resume_from_checkpoint is not None: - if self.is_deepspeed_enabled: - deepspeed_load_checkpoint( - self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model) - ) - elif self.is_fsdp_enabled: - self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) - - # Check if saved optimizer or scheduler states exist - self._load_optimizer_and_scheduler(resume_from_checkpoint) - - if self.gaudi_config.use_fused_clip_norm: - try: - from habana_frameworks.torch.hpex.normalization import FusedClipNorm - except ImportError as error: - error.msg = ( - f"Could not import 'FusedClipNorm' from 'habana_frameworks.torch.hpex.normalization'. {error.msg}." - ) - raise error - self.FusedNorm = FusedClipNorm( - model.parameters(), - args.max_grad_norm, - ) - - # important: at this point: - # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. - # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. - - # Train! - logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples:,}") - logger.info(f" Num Epochs = {num_train_epochs:,}") - logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") - if self.args.per_device_train_batch_size != self._train_batch_size: - logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") - logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size:,}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps:,}") - logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") - - self.state.epoch = 0 - start_time = time.time() - start_time_after_warmup = None - epochs_trained = 0 - steps_trained_in_current_epoch = 0 - steps_trained_progress_bar = None - - # Check if continuing training from a checkpoint - if resume_from_checkpoint is not None and os.path.isfile( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) - ): - self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - self.compare_trainer_and_checkpoint_args(self.args, self.state) - epochs_trained = self.state.global_step // num_update_steps_per_epoch - if not args.ignore_data_skip: - steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) - steps_trained_in_current_epoch *= args.gradient_accumulation_steps - else: - steps_trained_in_current_epoch = 0 - - logger.info(" Continuing training from checkpoint, will skip to saved global_step") - logger.info(f" Continuing training from epoch {epochs_trained}") - logger.info(f" Continuing training from global step {self.state.global_step}") - if not args.ignore_data_skip: - logger.info( - f" Will skip the first {epochs_trained} epochs then the first" - f" {steps_trained_in_current_epoch} batches in the first epoch." - ) - - # In multi-worker training: broadcast model parameters from worker:0 to all the others. - # This must be done manually unless DistributedDataParallel is used. - if self.args.parallel_mode == ParallelMode.DISTRIBUTED and self.args.distribution_strategy == "fast_ddp": - from ..distributed import all_reduce_gradients - - logger.debug( - f"Broadcasting the model parameters to assure that each of {self.args.world_size} workers start the training from the same point." - ) - for param in model.parameters(): - torch.distributed.broadcast(param.data, src=0) - - # Update the references - self.callback_handler.model = self.model - self.callback_handler.optimizer = self.optimizer - self.callback_handler.lr_scheduler = self.lr_scheduler - self.callback_handler.train_dataloader = train_dataloader - if self.hp_name is not None and self._trial is not None: - # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial - # parameter to Train when using DDP. - self.state.trial_name = self.hp_name(self._trial) - if trial is not None: - assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial - self.state.trial_params = hp_params(assignments) - else: - self.state.trial_params = None - # This should be the same if the state has been saved but in case the training arguments changed, it's safer - # to set this after the load. 
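When resuming from a checkpoint, the loop above reconstructs how many full epochs were completed and how many micro-batches to skip inside the unfinished epoch from the saved `global_step`. The same bookkeeping as a standalone sketch (only the `ignore_data_skip=False` branch is reproduced):

def resume_position(global_step, updates_per_epoch, grad_accum_steps, ignore_data_skip=False):
    # Returns (epochs_trained, micro_batches_to_skip_in_current_epoch).
    epochs_trained = global_step // updates_per_epoch
    if ignore_data_skip:
        return epochs_trained, 0
    # optimizer updates already done in the unfinished epoch, expressed in micro-batches
    return epochs_trained, (global_step % updates_per_epoch) * grad_accum_steps

# global_step 760 with 250 updates per epoch and grad_accum 4:
# 3 full epochs done, skip the first 40 micro-batches of the fourth epoch
print(resume_position(760, 250, 4))  # (3, 40)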
- self.state.max_steps = max_steps - self.state.num_train_epochs = num_train_epochs - self.state.is_local_process_zero = self.is_local_process_zero() - self.state.is_world_process_zero = self.is_world_process_zero() - - # tr_loss is a tensor to avoid synchronization of TPUs through .item() - tr_loss = torch.tensor(0.0).to(args.device) - # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses - self._total_loss_scalar = 0.0 - self._globalstep_last_logged = self.state.global_step - - self._zero_model_grad(model) - _grad_norm: Optional[float] = None - - self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - - # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. - if not args.ignore_data_skip: - for epoch in range(epochs_trained): - sampler = get_dataloader_sampler(train_dataloader) - sampler_kinds = [RandomSampler, SeedableRandomSampler] - is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) - if not is_random_sampler: - # We just need to begin an iteration to create the randomization of the sampler. - for _ in train_dataloader: - break - else: - # Otherwise we need to call the whooooole sampler cause there is some random operation added - # AT THE VERY END! - sampler = sampler if sampler is not None else [] - _ = list(sampler) - - if self.args.adjust_throughput: - self.log_evaluate_save_time = 0 - else: - self.log_evaluate_save_time = None - - hb_profiler = HabanaProfile( - warmup=self.args.profiling_warmup_steps, - active=self.args.profiling_steps, - record_shapes=self.args.profiling_record_shapes, - ) - hb_profiler.start() - - total_batched_samples = 0 - if _is_peft_model(self.model) and self.model.peft_type == PeftType.ADALORA: - self.model.base_model.peft_config[self.model.trainable_adapter_name].total_step = max_steps - if max_steps < self.model.base_model.peft_config[self.model.trainable_adapter_name].tfinal: - self.model.base_model.peft_config[self.model.trainable_adapter_name].tfinal = 0 - for epoch in range(epochs_trained, num_train_epochs): - epoch_iterator = train_dataloader - if hasattr(epoch_iterator, "set_epoch"): - epoch_iterator.set_epoch(epoch) - - # Reset the past mems state at the beginning of each epoch if necessary. 
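The `HabanaProfile` object above is driven with `start()` before the loop, one `step()` per training step and `stop()` at the end, with `warmup` and `active` selecting which steps actually get recorded. A schematic of that wiring; it assumes `optimum-habana` is installed and an HPU is present, and `train_dataloader`, `model` and `training_step` are placeholders rather than objects defined here:

from optimum.habana.utils import HabanaProfile

profiler = HabanaProfile(warmup=2, active=5, record_shapes=False)  # arguments mirror the call above
profiler.start()
for step, batch in enumerate(train_dataloader):   # placeholder dataloader
    loss = training_step(model, batch)            # placeholder step function
    profiler.step()                               # advance the warmup/active schedule
profiler.stop()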
- if args.past_index >= 0: - self._past = None - - steps_in_epoch = ( - len(epoch_iterator) - if len_dataloader is not None - else args.max_steps * args.gradient_accumulation_steps - ) - self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - - if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: - self._load_rng_state(resume_from_checkpoint) - - rng_to_sync = False - steps_skipped = 0 - if steps_trained_in_current_epoch > 0: - epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) - steps_skipped = steps_trained_in_current_epoch - steps_trained_in_current_epoch = 0 - rng_to_sync = True - - step = -1 - for step, inputs in enumerate(epoch_iterator): - if ( - args.throughput_warmup_steps > 0 - and (args.throughput_warmup_steps * args.gradient_accumulation_steps) - == epoch * steps_in_epoch + step - ): - start_time_after_warmup = time.time() - - total_batched_samples += 1 - - if self.args.include_num_input_tokens_seen: - main_input_name = getattr(self.model, "main_input_name", "input_ids") - if main_input_name not in inputs: - logger.warning( - "Tried to track the number of tokens seen, however the current model is " - "not configured properly to know what item is the input. To fix this, add " - "a `main_input_name` attribute to the model class you are using." - ) - else: - input_device = inputs[main_input_name].device - self.state.num_input_tokens_seen += torch.sum( - self.accelerator.gather( - torch.tensor(inputs[main_input_name].numel(), device=input_device, dtype=torch.int64) - ) - ).item() - if rng_to_sync: - self._load_rng_state(resume_from_checkpoint) - rng_to_sync = False - - # Skip past any already trained steps if resuming training - if steps_trained_in_current_epoch > 0: - steps_trained_in_current_epoch -= 1 - if steps_trained_progress_bar is not None: - steps_trained_progress_bar.update(1) - if steps_trained_in_current_epoch == 0: - self._load_rng_state(resume_from_checkpoint) - continue - elif steps_trained_progress_bar is not None: - steps_trained_progress_bar.close() - steps_trained_progress_bar = None - - if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - - # attn_softmax_bf16 and use_flash_attention is enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: - if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: - inputs["use_flash_attention"] = True - if self.model.generation_config.flash_attention_recompute: - inputs["flash_attention_recompute"] = True - if self.model.generation_config.flash_attention_causal_mask: - inputs["flash_attention_causal_mask"] = True - - # TODO: keep syncs for fast DDP? 
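Resumption above fast-forwards the dataloader with `accelerate.skip_first_batches` (imported at the top of this file) so the already-consumed batches are skipped without running them through the model. A self-contained example of that call:

from torch.utils.data import DataLoader
from accelerate import skip_first_batches

loader = DataLoader(list(range(10)), batch_size=2)    # batches: [0,1], [2,3], ..., [8,9]
resumed = skip_first_batches(loader, num_batches=2)   # drop the first two batches

print([batch.tolist() for batch in resumed])          # [[4, 5], [6, 7], [8, 9]]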
- with self.accelerator.accumulate(model): - tr_loss_step = self.training_step(model, inputs) - - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - - is_optimization_step = ( - total_batched_samples % args.gradient_accumulation_steps == 0 - or - # last step in epoch but step is always smaller than gradient_accumulation_steps - is_last_step_and_steps_less_than_grad_acc - ) - - if ( - args.parallel_mode == ParallelMode.DISTRIBUTED - and args.distribution_strategy == "fast_ddp" - and is_optimization_step - ): - all_reduce_gradients( - model, use_hpu_graphs=True - ) # use HPU graphs for gradient fusion regardless of args.use_hpu_graphs_for_training setting - - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): - # if loss is nan or inf simply add the average of previous logged losses - tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) - else: - if tr_loss.device != tr_loss_step.device: - raise ValueError( - f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" - ) - tr_loss += tr_loss_step - - self.current_flos += float(self.floating_point_ops(inputs)) - if args.use_lazy_mode: - self.htcore.mark_step() - - if is_optimization_step: - # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered - # in accelerate. So, explicitly enable sync gradients to True in that case. - if is_last_step_and_steps_less_than_grad_acc: - self.accelerator.gradient_state._set_sync_gradients(True) - - # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0: - # deepspeed does its own clipping - - if self.gaudi_config.use_fused_clip_norm and args.use_habana: - # TODO: to merge self.accelerator.clip_grad_norm_ when HMP is removed - _grad_norm = self.FusedNorm.clip_norm(model.parameters()) - else: - # Revert to normal clipping otherwise - _grad_norm = self.accelerator.clip_grad_norm_( - model.parameters(), - args.max_grad_norm, - ) - - # Optimizer step - optimizer_was_run = True - self.optimizer.step() - optimizer_was_run = not self.accelerator.optimizer_step_was_skipped - if optimizer_was_run: - # Delay optimizer scheduling until metrics are generated - if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - self.lr_scheduler.step() - self._zero_model_grad(model) - - self.state.global_step += 1 - self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch - if args.use_lazy_mode: - self.htcore.mark_step() - self.control = self.callback_handler.on_step_end(args, self.state, self.control) - - self._maybe_log_save_evaluate(tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval) - else: - self.control = self.callback_handler.on_substep_end(args, self.state, self.control) - - hb_profiler.step() - if self.control.should_epoch_stop or self.control.should_training_stop: - break - if step < 0: - logger.warning( - "There seems to be not a single sample in your epoch_iterator, stopping training at step" - f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" - f" num_steps ({max_steps}) higher than the number of available samples." 
- ) - self.control.should_training_stop = True - - self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval) - - if self.control.should_training_stop: - break - - hb_profiler.stop() - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of training - delattr(self, "_past") - - logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") - if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sure the model has been saved by process 0. - if args.parallel_mode == ParallelMode.DISTRIBUTED: - torch.distributed.barrier() - - self._load_best_model() - - # add remaining tr_loss - self._total_loss_scalar += tr_loss.item() - effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError - train_loss = self._total_loss_scalar / effective_global_step - - # Warmup steps are removed from the calculation of speed metrics - num_samples_for_speed_metrics = num_train_samples - args.throughput_warmup_steps * total_train_batch_size - num_steps_for_speed_metrics = self.state.max_steps - args.throughput_warmup_steps - metrics = speed_metrics( - "train", - start_time, - num_samples=num_samples_for_speed_metrics, - num_steps=num_steps_for_speed_metrics, - num_tokens=num_train_tokens, - start_time_after_warmup=start_time_after_warmup, - log_evaluate_save_time=self.log_evaluate_save_time, - ) - self.store_flos() - metrics["total_flos"] = self.state.total_flos - metrics["train_loss"] = train_loss - - self.is_in_train = False - - self._memory_tracker.stop_and_update_metrics(metrics) - - self.log(metrics) - - run_dir = self._get_output_dir(trial) - checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) - - # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. - if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: - for checkpoint in checkpoints_sorted: - if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): - logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - shutil.rmtree(checkpoint) - - self.control = self.callback_handler.on_train_end(args, self.state, self.control) - - # Wait for the checkpoint to be uploaded. - self._finish_current_push() - - # After training we make sure to retrieve back the original forward pass method - # for the embedding layer by removing the forward post hook. 
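The speed metrics above exclude the throughput warmup steps from both the sample and step counts so that graph compilation at the start of training does not drag down the reported throughput. A simplified version of that adjustment; the real `speed_metrics` helper lives in `optimum.habana.utils` and takes more arguments, so treat this as an approximation of the idea rather than its implementation:

import time

def warmup_adjusted_speed_metrics(start_time, num_train_samples, max_steps,
                                  throughput_warmup_steps, total_train_batch_size,
                                  start_time_after_warmup=None):
    # Samples/sec and steps/sec with the warmup portion removed, as in the loop above.
    num_samples = num_train_samples - throughput_warmup_steps * total_train_batch_size
    num_steps = max_steps - throughput_warmup_steps
    # When a post-warmup timestamp was recorded, measure the runtime from there instead.
    runtime = time.time() - (start_time_after_warmup or start_time)
    return {
        "train_runtime": round(runtime, 4),
        "train_samples_per_second": round(num_samples / runtime, 3),
        "train_steps_per_second": round(num_steps / runtime, 3),
    }

t0 = time.time(); time.sleep(0.1)
print(warmup_adjusted_speed_metrics(t0, num_train_samples=6000, max_steps=750,
                                    throughput_warmup_steps=3, total_train_batch_size=8))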
- if self.neftune_noise_alpha is not None: - self._deactivate_neftune(self.model) - - return TrainOutput(self.state.global_step, train_loss, metrics) - - def _load_best_model(self): - logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).") - best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) - best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME) - best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME) - best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME) - - model = self.model - # TODO: check if the code below works - # if self.is_deepspeed_enabled: - # deepspeed_load_checkpoint( - # self.model_wrapped, - # self.state.best_model_checkpoint, - # load_module_strict=not _is_peft_model(self.model), - # ) - # elif self.is_fsdp_enabled: - # load_result = load_fsdp_model( - # self.accelerator.state.fsdp_plugin, - # self.accelerator, - # model, - # self.state.best_model_checkpoint, - # **_get_fsdp_ckpt_kwargs(), - # ) - if ( - os.path.exists(best_model_path) - or os.path.exists(best_safe_model_path) - or os.path.exists(best_adapter_model_path) - or os.path.exists(best_safe_adapter_model_path) - ): - has_been_loaded = True - if _is_peft_model(model): - # If train a model using PEFT & LoRA, assume that adapter have been saved properly. - if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"): - if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path): - model.load_adapter(self.state.best_model_checkpoint, model.active_adapter) - # Load_adapter has no return value present, modify it when appropriate. - from torch.nn.modules.module import _IncompatibleKeys - - load_result = _IncompatibleKeys([], []) - else: - logger.warning( - "The intermediate checkpoints of PEFT may not be saved correctly, " - f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. " - "Check some examples here: https://github.com/huggingface/peft/issues/96" - ) - has_been_loaded = False - else: - logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed") - has_been_loaded = False - else: - # We load the model state dict on the CPU to avoid an OOM error. - if self.args.save_safetensors and os.path.isfile(best_safe_model_path): - state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu") - else: - state_dict = torch.load( - best_model_path, - map_location="cpu", - weights_only=True, - ) - - # If the model is on the GPU, it still works! - load_result = model.load_state_dict(state_dict, False) - - if has_been_loaded: - self._issue_warnings_after_load(load_result) - elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)): - load_result = load_sharded_checkpoint(model, self.state.best_model_checkpoint, strict=False) - self._issue_warnings_after_load(load_result) - else: - logger.warning( - f"Could not locate the best model at {best_model_path}, if you are running a distributed training " - "on multiple nodes, you should activate `--save_on_each_node`." 
- ) - - def _maybe_log_save_evaluate(self, tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval): - if self.args.adjust_throughput: - save_start = time.perf_counter() - - if self.control.should_log and self.state.global_step > self._globalstep_last_logged: - logs: Dict[str, float] = {} - - # all_gather + mean() to get average loss over all processes - tr_loss_scalar = self._nested_gather(tr_loss).mean().item() - - # reset tr_loss to zero - tr_loss -= tr_loss - logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) - - # This grad_norm block was outside of _maybe_log_save_evaluate method causing perf degradataion. - # Moving it here so the grad tensor is only copied when it's needed. - if is_accelerate_available() and self.accelerator.distributed_type == GaudiDistributedType.DEEPSPEED: - grad_norm = model.get_global_grad_norm() - # In some cases the grad norm may not return a float - if hasattr(grad_norm, "item"): - grad_norm = grad_norm.item() - else: - if ( - _grad_norm is not None - and self.accelerator.distributed_type != GaudiDistributedType.FSDP - and _grad_norm.size() == torch.Size([1]) - ): - grad_norm = _grad_norm.item() - else: - grad_norm = None - - if grad_norm is not None: - logs["grad_norm"] = grad_norm - logs["learning_rate"] = self._get_learning_rate() - - self._total_loss_scalar += tr_loss_scalar - self._globalstep_last_logged = self.state.global_step - self.store_flos() - - self.log(logs) - - metrics = None - if self.control.should_evaluate: - metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) - self._report_to_hp_search(trial, self.state.global_step, metrics) - - # Run delayed LR scheduler now that metrics are populated - if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - metric_to_check = self.args.metric_for_best_model - if not metric_to_check.startswith("eval_"): - metric_to_check = f"eval_{metric_to_check}" - self.lr_scheduler.step(metrics[metric_to_check]) - - if self.control.should_save: - self._save_checkpoint(model, trial, metrics=metrics) - self.control = self.callback_handler.on_save(self.args, self.state, self.control) - - if self.args.adjust_throughput: - self.log_evaluate_save_time += time.perf_counter() - save_start - - def _load_rng_state(self, checkpoint): - # Load RNG states from `checkpoint` - if checkpoint is None: - return - - if self.args.world_size > 1: - process_index = self.args.process_index - rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth") - if not os.path.isfile(rng_file): - logger.info( - f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " - "wasn't launched in a distributed fashion, reproducibility is not guaranteed." - ) - return - else: - rng_file = os.path.join(checkpoint, "rng_state.pth") - if not os.path.isfile(rng_file): - logger.info( - "Didn't find an RNG file, if you are resuming a training that was launched in a distributed " - "fashion, reproducibility is not guaranteed." 
- ) - return - - checkpoint_rng_state = torch.load(rng_file) - random.setstate(checkpoint_rng_state["python"]) - np.random.set_state(checkpoint_rng_state["numpy"]) - torch.random.set_rng_state(checkpoint_rng_state["cpu"]) - if self.args.use_habana: - if self.args.parallel_mode == ParallelMode.DISTRIBUTED: - self.hpu_random.set_rng_state_all(checkpoint_rng_state["hpu"]) - else: - try: - self.hpu_random.set_rng_state(checkpoint_rng_state["hpu"]) - except Exception as e: - logger.info( - f"Didn't manage to set back the RNG states of the HPU because of the following error:\n {e}" - "\nThis won't yield the same results as if the training had not been interrupted." - ) - - def _save_checkpoint(self, model, trial, metrics=None): - # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we - # want to save except FullyShardedDDP. - # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" - - # Save model checkpoint - checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" - - if self.hp_search_backend is None and trial is None: - self.store_flos() - - run_dir = self._get_output_dir(trial=trial) - output_dir = os.path.join(run_dir, checkpoint_folder) - self.save_model(output_dir, _internal_call=True) - - if not self.args.save_only_model: - # Save optimizer and scheduler - self._save_optimizer_and_scheduler(output_dir) - # Save RNG state - self._save_rng_state(output_dir) - - # Determine the new best metric / best model checkpoint - if metrics is not None and self.args.metric_for_best_model is not None: - metric_to_check = self.args.metric_for_best_model - if not metric_to_check.startswith("eval_"): - metric_to_check = f"eval_{metric_to_check}" - metric_value = metrics[metric_to_check] - - operator = np.greater if self.args.greater_is_better else np.less - if ( - self.state.best_metric is None - or self.state.best_model_checkpoint is None - or operator(metric_value, self.state.best_metric) - ): - self.state.best_metric = metric_value - self.state.best_model_checkpoint = output_dir - - # Save the Trainer state - if self.args.should_save: - self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) - - if self.args.push_to_hub: - self._push_from_checkpoint(output_dir) - - # Maybe delete some older checkpoints. - if self.args.should_save: - # Solely rely on numerical checkpoint id for rotation. - # mtime is not reliable especially on some fuse fs in cloud environments. - self._rotate_checkpoints(use_mtime=False, output_dir=run_dir) - - def _save_rng_state(self, output_dir): - # Save RNG state in non-distributed training - rng_states = { - "python": random.getstate(), - "numpy": np.random.get_state(), - "cpu": torch.random.get_rng_state(), - } - - if self.args.use_habana: - if self.args.parallel_mode == ParallelMode.DISTRIBUTED: - # In non distributed, we save the global HPU RNG state - rng_states["hpu"] = self.hpu_random.get_rng_state_all() - else: - rng_states["hpu"] = self.hpu_random.get_rng_state() - - # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may - # not yet exist. 
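`_save_rng_state` and `_load_rng_state` above snapshot the Python, NumPy, CPU and HPU generators into one `.pth` file per process. The CPU-only part of that round trip can be exercised anywhere; the `hpu` entry would come from `habana_frameworks.torch.hpu.random` and is left out of this sketch:

import random
import numpy as np
import torch

def save_rng_state(path):
    torch.save({
        "python": random.getstate(),
        "numpy": np.random.get_state(),
        "cpu": torch.random.get_rng_state(),
        # "hpu": hpu_random.get_rng_state_all(),  # on Gaudi, as in the trainer above
    }, path)

def load_rng_state(path):
    # weights_only=False because the dict holds plain Python/NumPy state, not just tensors
    state = torch.load(path, weights_only=False)
    random.setstate(state["python"])
    np.random.set_state(state["numpy"])
    torch.random.set_rng_state(state["cpu"])

save_rng_state("rng_state.pth")
before = torch.rand(3)
load_rng_state("rng_state.pth")
after = torch.rand(3)          # same draw as `before`, since the generator state was restored
assert torch.equal(before, after)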
- os.makedirs(output_dir, exist_ok=True) - - if self.args.world_size <= 1: - torch.save(rng_states, os.path.join(output_dir, "rng_state.pth")) - else: - torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth")) - - def _save_optimizer_and_scheduler(self, output_dir): - if self.is_deepspeed_enabled: - # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed - # config `stage3_gather_16bit_weights_on_model_save` is True - accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set( - inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys() - ) - if accept_exclude_frozen_parameters and _is_peft_model(self.model): - self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True) - else: - self.model_wrapped.save_checkpoint(output_dir) - elif self.is_fsdp_enabled: - # save fsdp specific ckpt for resuming from ckpt - save_fsdp_model( - self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **_get_fsdp_ckpt_kwargs() - ) - save_fsdp_optimizer( - self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir - ) - elif self.args.should_save: - # deepspeed.save_checkpoint above saves model/optim/sched - # This block is exectuted by the main process only - optim_dict = self.optimizer.state_dict() - if self.args.use_habana: - # Move the state dict from HPU to CPU before saving - optim_dict = to_device_dtype(optim_dict, target_device=torch.device("cpu")) - torch.save(optim_dict, os.path.join(output_dir, OPTIMIZER_NAME)) - - # Save SCHEDULER & SCALER - is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance( - self.lr_scheduler, DeepSpeedSchedulerWrapper - ) - if self.args.should_save and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler): - if self.args.use_habana: - # Move the state dict from HPU to CPU before saving - scheduler_dict = self.lr_scheduler.state_dict() - scheduler_dict = to_device_dtype(scheduler_dict, target_device=torch.device("cpu")) - with warnings.catch_warnings(record=True) as caught_warnings: - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - reissue_pt_warnings(caught_warnings) - - def _load_optimizer_and_scheduler(self, checkpoint): - """If optimizer and scheduler states exist, load them.""" - if checkpoint is None: - return - - if self.is_deepspeed_enabled: - # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init - if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper): - with warnings.catch_warnings(record=True) as caught_warnings: - self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME))) - reissue_pt_warnings(caught_warnings) - return - - checkpoint_file_exists = ( - os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) - or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN)) - or ( - os.path.isdir(checkpoint) - and any( - OPTIMIZER_NAME_BIN.split(".")[0] in folder_name - for folder_name in os.listdir(checkpoint) - if os.path.isdir(os.path.join(checkpoint, folder_name)) - ) - ) - ) - - if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)): - # We use the CPU when training on one GPU to avoid OOM for GPU RAM when training big models. 
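Before writing the optimizer state above, the trainer relocates it to the CPU with `to_device_dtype(..., target_device=torch.device('cpu'))`. A simplified stand-in for that helper, just enough to show the pattern of moving every tensor in a (possibly nested) state dict before `torch.save`:

import torch
import torch.nn as nn

def to_device(obj, device):
    # Simplified stand-in for optimum.habana's to_device_dtype: move tensors, leave everything else untouched.
    if isinstance(obj, torch.Tensor):
        return obj.to(device)
    if isinstance(obj, dict):
        return {k: to_device(v, device) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(to_device(v, device) for v in obj)
    return obj

model = nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model(torch.randn(2, 4)).sum().backward()
optimizer.step()  # populate exp_avg / exp_avg_sq buffers

cpu_state = to_device(optimizer.state_dict(), torch.device("cpu"))  # would be hpu -> cpu on Gaudi
torch.save(cpu_state, "optimizer.pt")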
- # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more - # likely to get OOM on CPU (since we load num_gpu times the optimizer state - map_location = "cpu" if self.args.use_habana else self.args.device - if self.is_fsdp_enabled: - load_fsdp_optimizer( - self.accelerator.state.fsdp_plugin, - self.accelerator, - self.optimizer, - self.model, - checkpoint, - **_get_fsdp_ckpt_kwargs(), - ) - else: - self.optimizer.load_state_dict( - torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) - ) - - with warnings.catch_warnings(record=True) as caught_warnings: - self.lr_scheduler.load_state_dict( - torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location=map_location) - ) - reissue_pt_warnings(caught_warnings) - - # Move optimizer state to HPU - if self.args.use_habana: - to_device_dtype(self.optimizer.state.values(), target_device=torch.device("hpu")) - - def log(self, logs: Dict[str, float]) -> None: - """ - Log `logs` on the various objects watching training. - Subclass and override this method to inject custom behavior. - Args: - logs (`Dict[str, float]`): - The values to log. - """ - if self.state.epoch is not None: - logs["epoch"] = self.state.epoch - if self.args.include_num_input_tokens_seen: - logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen - - mem_stats = get_hpu_memory_stats(self.args.device) - logs.update(mem_stats) - - output = {**logs, **{"step": self.state.global_step}} - self.state.log_history.append(output) - self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs) - - def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: - """ - Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. - Compared to Transformers, it is also possible to enable non-blocking data copy. - """ - if isinstance(data, Mapping): - return type(data)({k: self._prepare_input(v) for k, v in data.items()}) - elif isinstance(data, (tuple, list)): - return type(data)(self._prepare_input(v) for v in data) - elif isinstance(data, torch.Tensor): - kwargs = {"device": self.args.device} - if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)): - # NLP models inputs are int/uint and those get adjusted to the right dtype of the - # embedding. Other models such as wav2vec2's inputs are already float and thus - # may need special handling to match the dtypes of the model - kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()}) - if self.args.non_blocking_data_copy: - return data.to(**kwargs, non_blocking=True) - else: - return data.to(**kwargs) - return data - - def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True): - """ - A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired - arguments, depending on the situation. Modified by Habana to enable using `autocast` on Gaudi devices. - """ - if self.use_cpu_amp: - ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=torch.bfloat16) - elif self.use_hpu_amp: - ctx_manager = torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True) - else: - ctx_manager = contextlib.nullcontext() - - # Merge autocast context and `fp8_autocast` context if FP8 is enabled. - # Currently FP8 is enabled only for training. 
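`autocast_smart_context_manager` above chooses between CPU autocast, HPU autocast and a null context. The HPU branch is simply `torch.autocast(device_type="hpu", dtype=torch.bfloat16)`; swapping the device type keeps the same selection logic runnable on any machine, which is what this sketch does:

import contextlib
import torch

def autocast_context(device_type, enabled=True, cache_enabled=True):
    # bf16 autocast for the requested device, or a no-op context when disabled.
    if not enabled:
        return contextlib.nullcontext()
    # device_type would be "hpu" on Gaudi; "cpu" keeps this example runnable anywhere.
    return torch.autocast(device_type=device_type, dtype=torch.bfloat16, cache_enabled=cache_enabled)

x, w = torch.randn(4, 4), torch.randn(4, 4)
with autocast_context("cpu"):
    y = x @ w
print(y.dtype)  # torch.bfloat16 under CPU autocast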
- if self.accelerator.state.is_fp8_enabled and self.model.training: - ctx_manager = FP8ContextWrapper(ctx_manager, self.accelerator.fp8_recipe_handler) - - return ctx_manager - - def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: - """ - Perform a training step on a batch of inputs. - - Subclass and override to inject custom behavior. - - Args: - model (`torch.nn.Module`): - The model to train. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - - Return: - `torch.Tensor`: The tensor with training loss on this batch. - """ - model.train() - inputs = self._prepare_inputs(inputs) - - with self.compute_loss_context_manager(): - loss = self.compute_loss(model, inputs) - - if self.args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - - if self.args.use_lazy_mode and self.args.pipelining_fwd_bwd: - self.htcore.mark_step() - - if _is_peft_model(self.model) and self.model.peft_type == PeftType.ADALORA: - assert not ( - self.accelerator.state.is_fp8_enabled and self.args.gradient_checkpointing - ), "FP8 precision with gradient_checkpointing is currently not supported with PeftType.ADALORA" - if self.is_deepspeed_enabled and not is_deepspeed_zero3_enabled(): - self.accelerator.deepspeed_engine_wrapped.engine.backward(loss) - self.model.base_model.update_and_allocate(self.state.global_step) - self.accelerator.deepspeed_engine_wrapped.engine.step() - else: - self.accelerator.backward(loss) - self.model.base_model.update_and_allocate(self.state.global_step) - else: - if self.accelerator.state.is_fp8_enabled and self.args.gradient_checkpointing: - # The precision used in backward pass should be same as the one used in forward pass. - # However when training with gradient_checkpointing and FP8 precision, recompute forward - # in backward does not automatically run with FP8 precision. In order to handle this, - # the backward is run in `fp8_autocast` context - with FP8ContextWrapper.create_fp8_context(self.accelerator.fp8_recipe_handler): - self.accelerator.backward(loss) - else: - self.accelerator.backward(loss) - return loss.detach() / self.args.gradient_accumulation_steps - - def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): - """ - Will save the model, so you can reload it using `from_pretrained()`. - Will only save from the main process. - """ - if output_dir is None: - output_dir = self.args.output_dir - - if self.is_fsdp_enabled: - if "FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type): - state_dict = self.accelerator.get_state_dict(self.model) - if self.args.should_save: - self._save(output_dir, state_dict=state_dict) - elif self.is_deepspeed_enabled: - try: - state_dict = self.accelerator.get_state_dict(self.deepspeed) - if self.args.should_save: - self._save(output_dir, state_dict=state_dict) - except ValueError: - logger.warning( - " stage3_gather_16bit_weights_on_model_save=false. 
Saving the full checkpoint instead, use" - " zero_to_fp32.py to recover weights" - ) - if self.args.should_save: - self._save(output_dir, state_dict={}) - # remove the dummy state_dict - remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) - accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set( - inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys() - ) - if accept_exclude_frozen_parameters and _is_peft_model(self.model): - self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True) - else: - self.model_wrapped.save_checkpoint(output_dir) - elif self.args.should_save: - self._save(output_dir) - - # Push to the Hub when `save_model` is called by the user. - if self.args.push_to_hub and not _internal_call: - self.push_to_hub(commit_message="Model save") - - def _save(self, output_dir: Optional[str] = None, state_dict=None): - # If we are executing this function, we are the process zero, so we don't check for that. - output_dir = output_dir if output_dir is not None else self.args.output_dir - os.makedirs(output_dir, exist_ok=True) - logger.info(f"Saving model checkpoint to {output_dir}") - - supported_classes = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) - - if state_dict is None: - state_dict = self.model.state_dict() - if state_dict and self.args.use_habana: - # state_dict items have to be saved on the CPU - state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu")) - - # Save a trained model and configuration using `save_pretrained()`. - # They can then be reloaded using `from_pretrained()` - if not isinstance(self.model, supported_classes): - if isinstance(unwrap_model(self.model), supported_classes): - unwrap_model(self.model).save_pretrained( - output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors - ) - else: - logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") - if self.args.save_safetensors: - safetensors.torch.save_file( - state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"} - ) - else: - torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) - else: - self.model.save_pretrained( - output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors - ) - - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - - self.gaudi_config.save_pretrained(output_dir) - - # Good practice: save your training arguments together with the trained model - torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) - - def evaluate( - self, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> Dict[str, float]: - """ - From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification - 1. comment out TPU related - 2. 
use throughput_warmup_steps in evaluation throughput calculation - """ - # handle multiple eval datasets - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - if isinstance(eval_dataset, dict): - metrics = {} - for eval_dataset_name, _eval_dataset in eval_dataset.items(): - dataset_metrics = self.evaluate( - eval_dataset=_eval_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", - ) - metrics.update(dataset_metrics) - return metrics - - # memory metrics - must set up as early as possible - self._memory_tracker.start() - - eval_dataloader = self.get_eval_dataloader(eval_dataset) - - start_time = time.time() - self.start_time_after_warmup = None - - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if self.compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - num_samples = output.num_samples - self.args.throughput_warmup_steps * total_batch_size - num_steps = math.ceil(output.num_samples / total_batch_size) - self.args.throughput_warmup_steps - - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=num_samples, - num_steps=num_steps, - start_time_after_warmup=self.start_time_after_warmup, - ) - ) - - self.log(output.metrics) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return output.metrics - - def evaluation_loop( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. - Works both with or without labels.
- """ - args = self.args - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - # if eval is called w/o train, handle model prep here - if self.is_deepspeed_enabled and self.deepspeed is None: - _, _ = deepspeed_init(self, num_training_steps=0, inference=True) - - model = self._wrap_model(self.model, training=False, dataloader=dataloader) - - if len(self.accelerator._models) == 0 and model is self.model: - model = ( - self.accelerator.prepare(model) - if self.is_deepspeed_enabled - else self.accelerator.prepare_model(model, evaluation_mode=True) - ) - - if self.is_fsdp_enabled: - self.model = model - - # for the rest of this function `model` is the outside model, whether it was wrapped or not - if model is not self.model: - self.model_wrapped = model - - # backward compatibility - if self.is_deepspeed_enabled: - self.deepspeed = self.model_wrapped - - model.eval() - - # Do not use HPU graphs if the training is ongoing because it detaches gradients - if args.use_hpu_graphs_for_inference and not self.is_in_train: - logger.info("Using HPU graphs for inference.") - # Do not wrap the model in HPU graphs if it has already been done - if not self.already_wrapped_for_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model = wrap_in_hpu_graph( - model, disable_tensor_cache=args.disable_tensor_cache_hpu_graphs, max_graphs=args.max_hpu_graphs - ) - self.already_wrapped_for_hpu_graphs = True - - # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called - # while ``train`` is running, cast it to the right dtype first and then put on device - if not self.is_in_train: - if args.fp16_full_eval: - model = model.to(dtype=torch.float16, device=args.device) - elif args.bf16_full_eval: - model = model.to(dtype=torch.bfloat16, device=args.device) - - batch_size = self.args.eval_batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. - eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - all_losses = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) - all_preds = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) - all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) - all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) - - # Will be useful when we have an iterable dataset so don't know its length. - observed_num_examples = 0 - - # Main evaluation loop - for step, inputs in enumerate(dataloader): - if ( - self.args.throughput_warmup_steps > 0 - and not self.is_in_train - and step == self.args.throughput_warmup_steps - ): - self.start_time_after_warmup = time.time() - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. 
- if batch_size is None: - batch_size = observed_batch_size - - # attn_softmax_bf16 and use_flash_attention are enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: - if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: - inputs["use_flash_attention"] = True - if self.model.generation_config.flash_attention_recompute: - inputs["flash_attention_recompute"] = True - if self.model.generation_config.flash_attention_causal_mask: - inputs["flash_attention_causal_mask"] = True - - # Prediction step - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = getattr(self.model, "main_input_name", "input_ids") - inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None - - # Save the logits dtype since we need to convert them into floats during the process - # They will be converted back into their original dtype right before computing metrics - if logits is not None: - logits_dtype = get_dtype(logits) - - # Update containers - if loss is not None: - # Handle NaN loss - if args.logging_nan_inf_filter and (torch.isnan(loss) or torch.isinf(loss)): - # If loss is NaN or Inf, use a small constant - loss = torch.tensor(1e-8, device=loss.device) - - losses = self.gather_function((loss.repeat(batch_size))) - all_losses.add(losses) - if labels is not None: - labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) - labels = self.gather_function((labels)) - all_labels.add(labels) - if inputs_decode is not None: - inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) - inputs_decode = self.gather_function((inputs_decode)) - all_inputs.add(inputs_decode) - if logits is not None: - if args.use_habana and logits_dtype != "float32": - logits = to_device_dtype(logits, target_dtype=torch.float32) - logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - logits = self.gather_function((logits)) - all_preds.add(logits) - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - all_losses.to_cpu_and_numpy() - all_preds.to_cpu_and_numpy() - all_labels.to_cpu_and_numpy() - all_inputs.to_cpu_and_numpy() - - # nested concat does accumulation on tensors of variable length. 
- # Added mark step here to avoid graph recompile - if args.use_lazy_mode: - self.htcore.mark_step() - - # After all calls to `.gather_function`, reset to `gather_for_metrics`: - self.gather_function = self.accelerator.gather_for_metrics - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - all_losses = all_losses.get_arrays() - all_preds = all_preds.get_arrays() - all_labels = all_labels.get_arrays() - all_inputs = all_inputs.get_arrays() - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - if num_samples == 0 and observed_num_examples > 0: - num_samples = observed_num_examples - - # Convert predictions back into their original dtype if necessary - if all_preds is not None: - all_preds = convert_into_dtypes(all_preds, logits_dtype) - - # Metrics! - if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if isinstance(all_losses, list) and all_losses: - metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item() - elif isinstance(all_losses, np.ndarray): - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) - - def prediction_step( - self, - model: torch.nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - Subclass and override to inject custom behavior. - Args: - model (`torch.nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. 
- Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, - logits and labels (each being optional). - """ - has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) - # For CLIP-like models capable of returning loss values. - # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` - # is `True` in `model.forward`. - return_loss = inputs.get("return_loss", None) - if return_loss is None: - return_loss = self.can_return_loss - loss_without_labels = True if len(self.label_names) == 0 and return_loss else False - - inputs = self._prepare_inputs(inputs) - if ignore_keys is None: - if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) - else: - ignore_keys = [] - - # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. - if has_labels or loss_without_labels: - labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) - if len(labels) == 1: - labels = labels[0] - else: - labels = None - - with torch.no_grad(): - try: - if has_labels or loss_without_labels: - with self.compute_loss_context_manager(): - loss, outputs = self.compute_loss(model, inputs, return_outputs=True) - loss = loss.mean().detach() - - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) - else: - logits = outputs[1:] - else: - loss = None - with self.compute_loss_context_manager(): - outputs = model(**inputs) - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) - else: - logits = outputs - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index - 1] - except RuntimeError as error: - if "cpu fallback is not supported during hpu graph capturing" in str(error): - error.args = ( - f"{error}. You should run inference in lazy mode only with `use_lazy_mode=True` and `use_hpu_graphs_for_inference=False`.", - ) - raise error - - if self.args.use_lazy_mode and not (self.args.use_hpu_graphs_for_inference and not self.is_in_train): - self.htcore.mark_step() - - if prediction_loss_only: - return (loss, None, None) - - logits = nested_detach(logits) - if len(logits) == 1: - logits = logits[0] - - return (loss, logits, labels) - - def _push_from_checkpoint(self, checkpoint_folder): - # Only push from one node. - if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END: - return - # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True. - if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done(): - return - - output_dir = self.args.output_dir - # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder - modeling_files = [CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME, GAUDI_CONFIG_NAME] - for modeling_file in modeling_files: - if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)): - shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file)) - # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. 
- if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - # Same for the training arguments - torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) - - if self.args.save_strategy == IntervalStrategy.STEPS: - commit_message = f"Training in progress, step {self.state.global_step}" - else: - commit_message = f"Training in progress, epoch {int(self.state.epoch)}" - - model_push_job = upload_folder( - repo_id=self.hub_model_id, - folder_path=output_dir, - commit_message=commit_message, - token=self.args.hub_token, - run_as_future=True, - ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"], - ) - - push_jobs = [model_push_job] - - if self.args.hub_strategy in [HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS]: - path_in_repo = ( - "last-checkpoint" if self.args.hub_strategy == HubStrategy.CHECKPOINT else Path(checkpoint_folder).name - ) - checkpoint_push = upload_folder( - repo_id=self.hub_model_id, - folder_path=checkpoint_folder, - path_in_repo=path_in_repo, - commit_message=commit_message + ", checkpoint", - token=self.args.hub_token, - run_as_future=True, - ) - push_jobs.append(checkpoint_push) - - if self.push_in_progress is None or self.push_in_progress.is_done(): - self.push_in_progress = PushInProgress(push_jobs) - else: - self.push_in_progress.jobs.extend(push_jobs) - - # - # Deprecated code - # - def prediction_loop( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. - Works both with or without labels. - """ - args = self.args - - if not has_length(dataloader): - raise ValueError("dataloader must implement a working __len__") - - prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only - - # if eval is called w/o train, handle model prep here - if self.is_deepspeed_enabled and self.deepspeed is None: - _, _ = deepspeed_init(self, num_training_steps=0, inference=True) - - model = self._wrap_model(self.model, training=False, dataloader=dataloader) - - if len(self.accelerator._models) == 0 and model is self.model: - model = ( - self.accelerator.prepare(model) - if self.is_deepspeed_enabled - else self.accelerator.prepare_model(model, evaluation_mode=True) - ) - - if self.is_fsdp_enabled: - self.model = model - - # for the rest of this function `model` is the outside model, whether it was wrapped or not - if model is not self.model: - self.model_wrapped = model - - # backward compatibility - if self.is_deepspeed_enabled: - self.deepspeed = self.model_wrapped - model.eval() - - # Do not use HPU graphs if the training is ongoing because it detaches gradients - if args.use_hpu_graphs_for_inference and not self.is_in_train: - logger.info("Using HPU graphs for inference.") - # Do not wrap the model in HPU graphs if it has already been done - if not self.already_wrapped_for_hpu_graphs: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model = wrap_in_hpu_graph( - model, disable_tensor_cache=args.disable_tensor_cache_hpu_graphs, max_graphs=args.max_hpu_graphs - ) - self.already_wrapped_for_hpu_graphs = True - - # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called - # while ``train`` is running, cast it to the right dtype first and then put on device - if not self.is_in_train: - if args.fp16_full_eval: - model = 
model.to(dtype=torch.float16, device=args.device) - elif args.bf16_full_eval: - model = model.to(dtype=torch.bfloat16, device=args.device) - - batch_size = dataloader.batch_size - num_examples = self.num_examples(dataloader) - logger.info(f"***** Running {description} *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Batch size = {batch_size}") - - losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None - - world_size = max(1, args.world_size) - - eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) - if not prediction_loss_only: - # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass - # a batch size to the sampler) - make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): - make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - - if args.past_index >= 0: - self._past = None - - self.callback_handler.eval_dataloader = dataloader - - for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - main_input_name = getattr(self.model, "main_input_name", "input_ids") - inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None - - if loss is not None: - losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - # Set back to None to begin a new accumulation - losses_host, preds_host, labels_host, inputs_host = None, None, None, None - - # nested concat does accumulation on tensors of variable length. 
- # Added mark step here to avoid graph recompile - if args.use_lazy_mode: - self.htcore.mark_step() - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) - if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) - inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) - - eval_loss = eval_losses_gatherer.finalize() - preds = preds_gatherer.finalize() if not prediction_loss_only else None - label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids, inputs=inputs_ids) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if eval_loss is not None: - metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples) - - def create_accelerator_and_postprocess(self): - grad_acc_kwargs = {} - if is_accelerate_available("0.28.0") and self.args.accelerator_config.gradient_accumulation_kwargs is not None: - grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs - - # check if num_steps is attempted to be passed in gradient_accumulation_kwargs - if "num_steps" in grad_acc_kwargs and self.args.gradient_accumulation_steps > 1: - # raise because we do not know which setting is intended. - raise ValueError( - "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`" - "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`." - ) - elif "num_steps" not in grad_acc_kwargs: - # take the gradient_accumulation_steps setting from TrainingArguments. 
- grad_acc_kwargs["num_steps"] = self.args.gradient_accumulation_steps - - grad_acc_kwargs["sync_with_dataloader"] = False - - gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs) - - accelerator_config = self.args.accelerator_config.to_dict() - - if is_accelerate_available("0.28.0"): - dataloader_config = DataLoaderConfiguration( - split_batches=accelerator_config.pop("split_batches"), - dispatch_batches=accelerator_config.pop("dispatch_batches"), - even_batches=accelerator_config.pop("even_batches"), - use_seedable_sampler=accelerator_config.pop("use_seedable_sampler"), - ) - # this would have been updated above, no need for it anymore - accelerator_config.pop("gradient_accumulation_kwargs") - - args = { - "deepspeed_plugin": self.args.deepspeed_plugin, - "gradient_accumulation_plugin": gradient_accumulation_plugin, - "distribution_strategy": self.args.distribution_strategy, - } - if is_accelerate_available("0.28.0"): - args["dataloader_config"] = dataloader_config - else: - args.update(accelerator_config) - - # create accelerator object - self.accelerator = GaudiAccelerator(**args) - # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag - self.gather_function = self.accelerator.gather_for_metrics - - # deepspeed and accelerate flags covering both trainer args and accelerate launcher - self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None - self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None - - # post accelerator creation setup - # copy of https://github.com/huggingface/transformers/blob/b71f20a7c9f3716d30f6738501559acf863e2c5c/src/transformers/trainer.py#L3991 - # post accelerator creation setup - if self.is_fsdp_enabled: - fsdp_plugin = self.accelerator.state.fsdp_plugin - fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get( - "limit_all_gathers", fsdp_plugin.limit_all_gathers - ) - if is_accelerate_available("0.23.0"): - fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get( - "activation_checkpointing", fsdp_plugin.activation_checkpointing - ) - if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing: - raise ValueError( - "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg " - "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic " - "when using FSDP." 
- ) - - if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None: - self.propagate_args_to_deepspeed() - - # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end` - if ( - self.args.save_only_model - and (self.is_deepspeed_enabled or self.is_fsdp_enabled) - and self.args.load_best_model_at_end - ): - wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP" - raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.") - - # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP - if (self.is_deepspeed_enabled or self.is_fsdp_enabled) and self.args.auto_find_batch_size: - wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP" - raise NotImplementedError(f"`{wrapper}` doesn't support `auto_find_batch_size`.") - - def propagate_args_to_deepspeed(self, auto_find_batch_size=False): - """ - Sets values in the deepspeed plugin based on the Trainer args - """ - from .integrations.deepspeed import GaudiTrainerDeepSpeedConfig - - ds_plugin = self.accelerator.state.deepspeed_plugin - - ds_plugin.hf_ds_config = GaudiTrainerDeepSpeedConfig(ds_plugin.hf_ds_config.config) - ds_plugin.deepspeed_config = ds_plugin.hf_ds_config.config - ds_plugin.hf_ds_config.trainer_config_process(self.args, auto_find_batch_size) - - def _zero_model_grad(self, model): - if hasattr(model, "_zero_grad_kwargs"): - model.zero_grad(**model._zero_grad_kwargs) - else: - # Optimization based on setting gradients to None (instead of zeroing them out) may only be used when gradients are not recorded using HPU graphs. - # HPU graphs rely on fixed tensors - setting gradients to None will enforce their re-allocation during the backward pass each step. - set_to_none = ( - self.args.parallel_mode != ParallelMode.DISTRIBUTED or self.args.distribution_strategy == "ddp" - ) and not self.args.use_hpu_graphs_for_training - - try: - model.zero_grad(set_to_none=set_to_none) - model._zero_grad_kwargs = {"set_to_none": set_to_none} - except TypeError: - model.zero_grad() - model._zero_grad_kwargs = {} diff --git a/optimum/habana/transformers/trainer_seq2seq.py b/optimum/habana/transformers/trainer_seq2seq.py deleted file mode 100644 index 52977e30a0..0000000000 --- a/optimum/habana/transformers/trainer_seq2seq.py +++ /dev/null @@ -1,395 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
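For reference, before the seq2seq trainer that follows, here is a minimal standalone sketch (plain PyTorch, no HPU required) of the gradient-zeroing pattern used by `_zero_model_grad` above; the condition is a placeholder rather than the trainer's real distribution/HPU-graph check:

import torch

model = torch.nn.Linear(4, 2)

# Placeholder condition: in the trainer it depends on the distribution strategy and on
# whether HPU graphs (which require fixed gradient buffers) are used for training.
set_to_none = True

try:
    model.zero_grad(set_to_none=set_to_none)
    model._zero_grad_kwargs = {"set_to_none": set_to_none}
except TypeError:
    # Older PyTorch releases without the keyword argument.
    model.zero_grad()
    model._zero_grad_kwargs = {}

# Later steps reuse the cached decision without re-evaluating the condition.
model.zero_grad(**model._zero_grad_kwargs)

In the trainer, freeing gradients is avoided when HPU graphs capture training, since graph replay relies on fixed tensors and setting gradients to None would force their re-allocation every step.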
- -import warnings -from copy import deepcopy -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - -import torch -from torch.utils.data import Dataset -from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled - -from optimum.utils import logging - -from .generation import GaudiGenerationConfig -from .trainer import GaudiTrainer - - -if TYPE_CHECKING: - from transformers.data.data_collator import DataCollator - from transformers.modeling_utils import PreTrainedModel - from transformers.tokenization_utils_base import PreTrainedTokenizerBase - from transformers.trainer_callback import TrainerCallback - from transformers.trainer_utils import EvalPrediction, PredictionOutput - - from .gaudi_configuration import GaudiConfig - from .training_args import GaudiTrainingArguments - - -logger = logging.get_logger(__name__) - - -class GaudiSeq2SeqTrainer(GaudiTrainer): - def __init__( - self, - model: Union["PreTrainedModel", torch.nn.Module] = None, - gaudi_config: "GaudiConfig" = None, - args: "GaudiTrainingArguments" = None, - data_collator: Optional["DataCollator"] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - tokenizer: Optional["PreTrainedTokenizerBase"] = None, - model_init: Optional[Callable[[], "PreTrainedModel"]] = None, - compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None, - callbacks: Optional[List["TrainerCallback"]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - ): - super().__init__( - model=model, - gaudi_config=gaudi_config, - args=args, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - # Override self.model.generation_config if a GenerationConfig is specified in args. - # Priority: args.generation_config > model.generation_config > default GenerationConfig. - if self.args.generation_config is not None: - gen_config = self.load_generation_config(self.args.generation_config) - self.model.generation_config = gen_config - - @staticmethod - def load_generation_config(gen_config_arg: Union[str, GaudiGenerationConfig]) -> GaudiGenerationConfig: - """ - Loads a `~generation.GaudiGenerationConfig` from the `GaudiSeq2SeqTrainingArguments.generation_config` arguments. - - Args: - gen_config_arg (`str` or [`~generation.GaudiGenerationConfig`]): - `GaudiSeq2SeqTrainingArguments.generation_config` argument. - - Returns: - A `~generation.GaudiGenerationConfig`. 
- """ - - # GenerationConfig provided, nothing to do - if isinstance(gen_config_arg, GaudiGenerationConfig): - gen_config = deepcopy(gen_config_arg) - else: - # str or Path - pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg - config_file_name = None - - # Figuring if it is path pointing to a file, pointing to a directory or else a model id or URL - # This step is required in order to determine config_file_name - if pretrained_model_name.is_file(): - config_file_name = pretrained_model_name.name - pretrained_model_name = pretrained_model_name.parent - # dir path - elif pretrained_model_name.is_dir(): - pass - # model id or URL - else: - pretrained_model_name = gen_config_arg - - gen_config = GaudiGenerationConfig.from_pretrained(pretrained_model_name, config_file_name) - - # Strict validation to fail early. `GenerationConfig.save_pretrained()`, run at the end of training, throws - # an exception if there are warnings at validation time. - try: - with warnings.catch_warnings(record=True) as caught_warnings: - gen_config.validate() - if len(caught_warnings) > 0: - raise ValueError(str([w.message for w in caught_warnings])) - except ValueError as exc: - raise ValueError( - "The loaded generation config instance is invalid -- `GenerationConfig.validate()` throws warnings " - "and/or exceptions. Fix these issues to train your model.\n\nThrown during validation:\n" + str(exc) - ) - return gen_config - - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - **gen_kwargs, - ) -> Dict[str, float]: - """ - Run evaluation and returns metrics. - The calling script will be responsible for providing a method to compute metrics, as they are task-dependent - (pass it to the init `compute_metrics` argument). - You can also subclass and override this method to inject custom behavior. - Args: - eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` - method. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. 
- """ - - gen_kwargs = gen_kwargs.copy() - - # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the - # training args - if ( - gen_kwargs.get("max_length") is None - and gen_kwargs.get("max_new_tokens") is None - and self.args.generation_max_length is not None - ): - gen_kwargs["max_length"] = self.args.generation_max_length - if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: - gen_kwargs["num_beams"] = self.args.generation_num_beams - # We don't want to drop samples in general - self.gather_function = self.accelerator.gather - self._gen_kwargs = gen_kwargs - return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "test", - **gen_kwargs, - ) -> "PredictionOutput": - """ - Run prediction and returns predictions and potential metrics. - Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method - will also return metrics, like in `evaluate()`. - Args: - test_dataset (`Dataset`): - Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the - `model.forward()` method are automatically removed. Has to implement the method `__len__` - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - - If your predictions or labels have different sequence lengths (for instance because you're doing dynamic - padding in a token classification task) the predictions will be padded (on the right) to allow for - concatenation into one array. The padding index is -100. - - Returns: *NamedTuple* A namedtuple with the following keys: - - predictions (`np.ndarray`): The predictions on `test_dataset`. - - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained - labels). 
- """ - - gen_kwargs = gen_kwargs.copy() - - # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the - # training args - if ( - gen_kwargs.get("max_length") is None - and gen_kwargs.get("max_new_tokens") is None - and self.args.generation_max_length is not None - ): - gen_kwargs["max_length"] = self.args.generation_max_length - if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: - gen_kwargs["num_beams"] = self.args.generation_num_beams - self.gather_function = self.accelerator.gather - self._gen_kwargs = gen_kwargs - - return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def prediction_step( - self, - model: torch.nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - **gen_kwargs, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - Subclass and override to inject custom behavior. - Args: - model (`torch.nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - gen_kwargs: - Additional `generate` specific kwargs. - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). - """ - - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # Priority (handled in generate): - # non-`None` gen_kwargs > model.generation_config > default GenerationConfig() - if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): - gen_kwargs = self._gen_kwargs.copy() - if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None: - gen_kwargs.pop("num_beams") - if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: - gen_kwargs.pop("max_length") - - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - # pad batches to max_length on-the-fly in lazy mode - gen_kwargs["lazy_mode"] = ( - gen_kwargs["lazy_mode"] if gen_kwargs.get("lazy_mode") is not None else self.args.use_lazy_mode - ) - gen_kwargs["ignore_eos"] = ( - gen_kwargs["ignore_eos"] if gen_kwargs.get("ignore_eos") is not None else self.args.ignore_eos - ) - gen_kwargs["hpu_graphs"] = ( - gen_kwargs["hpu_graphs"] - if gen_kwargs.get("hpu_graphs") is not None - else self.args.use_hpu_graphs_for_inference - ) - - generation_inputs = inputs.copy() - # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate - # (otherwise, it would continue generating from the padded `decoder_input_ids`) - if ( - "labels" in generation_inputs - and "decoder_input_ids" in generation_inputs - and generation_inputs["labels"].shape == generation_inputs["decoder_input_ids"].shape - ): - generation_inputs = { - k: v for k, v in 
inputs.items() if k not in ("decoder_input_ids", "decoder_attention_mask") - } - try: - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.use_hpu_amp): - generated_tokens = self.model.generate( - **generation_inputs, - generation_config=self.model.generation_config, - **gen_kwargs, - ) - except RuntimeError as error: - if "cpu fallback is not supported during hpu graph capturing" in str(error): - error.args = ( - f"{error}. You should run inference in lazy mode only with `use_lazy_mode=True` and `use_hpu_graphs_for_inference=False`.", - ) - raise error - - # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop - # TODO: remove this hack when the legacy code that initializes generation_config from a model config is - # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 - if self.model.generation_config._from_model_config: - self.model.generation_config._from_model_config = False - - # Retrieves GenerationConfig from model.generation_config - gen_config = self.model.generation_config - # in case the batch is shorter than max length, the output should be padded - if gen_config.max_length is not None and generated_tokens.shape[-1] < gen_config.max_length: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) - elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) - - # Different processes may have different generation work in eager mode, hence this sync - if not self.args.use_lazy_mode and self.args.local_rank != -1: - torch.distributed.barrier() - - with torch.no_grad(): - try: - if has_labels: - with self.compute_loss_context_manager(): - outputs = model(**inputs) - if self.label_smoother is not None: - loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() - else: - loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() - else: - loss = None - except RuntimeError as error: - if "cpu fallback is not supported during hpu graph capturing" in str(error): - error.args = ( - f"{error}. 
You should run inference in lazy mode only with `use_lazy_mode=True` and `use_hpu_graphs_for_inference=False`.", - ) - raise error - - if self.args.use_lazy_mode and not (self.args.use_hpu_graphs_for_inference and not self.is_in_train): - self.htcore.mark_step() - - if self.args.prediction_loss_only: - return loss, None, None - - if has_labels: - labels = inputs["labels"] - if gen_config.max_length is not None and labels.shape[-1] < gen_config.max_length: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) - elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) - else: - labels = None - - return loss, generated_tokens, labels - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id - else: - raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor diff --git a/optimum/habana/transformers/trainer_utils.py b/optimum/habana/transformers/trainer_utils.py deleted file mode 100644 index edc6bfe29e..0000000000 --- a/optimum/habana/transformers/trainer_utils.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Tuple, Union - -import numpy as np -import torch - - -def get_dtype(logits: Union[torch.Tensor, Tuple[torch.Tensor]]) -> Union[str, List[str]]: - """ - Extract the dtype of logits. 
- - Args: - logits (Union[torch.Tensor, Tuple[torch.Tensor]]): input - - Raises: - TypeError: only torch.Tensor and tuple are supported - - Returns: - Union[str, List[str]]: logits' dtype - """ - if isinstance(logits, torch.Tensor): - # The dtype of a Torch tensor has the format 'torch.XXX', XXX being the actual dtype - logits_dtype = str(logits.dtype).split(".")[-1] - # If mixed-precision training was performed, dtype must be 'float32' to be understood by Numpy - if logits_dtype == "bfloat16": - logits_dtype = "float32" - return logits_dtype - elif isinstance(logits, tuple): - return get_dtype(logits[0]) - elif isinstance(logits, dict): - return {k: get_dtype(v) for k, v in logits.items()} - else: - raise TypeError(f"logits should be of type torch.Tensor or tuple, got {type(logits)} which is not supported") - - -def convert_into_dtypes( - preds: Union[np.ndarray, Tuple[np.ndarray]], dtype: str -) -> Union[np.ndarray, Tuple[np.ndarray]]: - """ - Convert preds into the target dtype. - - Args: - preds (Union[np.ndarray, Tuple[np.ndarray]]): predictions to convert - dtype (str): dtype used for the conversion - - Raises: - TypeError: only np.ndarray and tuple are supported - - Returns: - Union[np.ndarray, Tuple[np.ndarray]]: converted preds - """ - if isinstance(preds, np.ndarray): - if preds.dtype == dtype: - return preds - else: - return preds.astype(dtype) - elif isinstance(preds, tuple): - return tuple(convert_into_dtypes(preds_tensor, dtype) for preds_tensor in preds) - else: - raise TypeError(f"preds should be of type np.ndarray or tuple, got {type(preds)} which is not supported") diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py deleted file mode 100644 index 39520cb613..0000000000 --- a/optimum/habana/transformers/training_args.py +++ /dev/null @@ -1,865 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
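For reference, the two helpers removed from trainer_utils.py above are meant to work as a pair: `get_dtype` records the logits dtype before they are gathered as NumPy arrays (which cannot hold bfloat16), and `convert_into_dtypes` restores that dtype just before metrics are computed. A minimal sketch with made-up tensors, assuming only PyTorch and NumPy:

import numpy as np
import torch

logits = torch.randn(2, 4, dtype=torch.bfloat16)

# get_dtype(): "bfloat16" is recorded as "float32" because NumPy has no bfloat16.
dtype = str(logits.dtype).split(".")[-1]
dtype = "float32" if dtype == "bfloat16" else dtype

# During gathering, logits are cast to float32 and moved to the host as NumPy arrays.
preds = logits.float().cpu().numpy()

# convert_into_dtypes(): restore the recorded dtype right before computing metrics
# (a no-op here; a float16 run would be converted back to float16).
preds = preds.astype(dtype)
assert preds.dtype == np.float32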
- -import io -import json -import os -import warnings -from dataclasses import asdict, dataclass, field -from datetime import timedelta -from pathlib import Path -from typing import Optional, Union - -from packaging import version -from transformers.debug_utils import DebugOption -from transformers.file_utils import cached_property, is_torch_available, requires_backends -from transformers.trainer_pt_utils import AcceleratorConfig -from transformers.trainer_utils import EvaluationStrategy, FSDPOption, HubStrategy, IntervalStrategy, SchedulerType -from transformers.training_args import ( - _VALID_DICT_FIELDS, - OptimizerNames, - ParallelMode, - TrainingArguments, - _convert_str_dict, - default_logdir, -) -from transformers.utils import ( - ACCELERATE_MIN_VERSION, - get_full_repo_name, - is_accelerate_available, - is_safetensors_available, - strtobool, -) - -from optimum.utils import logging - -from ..accelerate.state import GaudiAcceleratorState, GaudiPartialState -from ..accelerate.utils import GaudiDistributedType -from ..utils import get_habana_frameworks_version -from .gaudi_configuration import GaudiConfig - - -if is_torch_available(): - import torch - - -logger = logging.get_logger(__name__) - - -# List of arguments that are not supported by optimum-habana -UNSUPPORTED_ARGUMENTS = [ - "fp16", - "fp16_backend", - "fp16_full_eval", - "fp16_opt_level", - "mp_parameters", - "tf32", - "tpu_metrics_debug", - "tpu_num_cores", -] - - -# List of supported distribution strategies -SUPPORTED_DISTRIBUTION_STRATEGIES = [ - "ddp", # default - "fast_ddp", -] - - -@dataclass -class GaudiTrainingArguments(TrainingArguments): - """ - GaudiTrainingArguments is built on top of the Transformers' [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) - to enable deployment on Habana's Gaudi. - - Args: - use_habana (`bool`, *optional*, defaults to `False`): - Whether to use Habana's HPU for running the model. - gaudi_config_name (`str`, *optional*): - Pretrained Gaudi config name or path. - use_lazy_mode (`bool`, *optional*, defaults to `True`): - Whether to use lazy mode for running the model. - use_hpu_graphs (`bool`, *optional*, defaults to `False`): - Deprecated, use `use_hpu_graphs_for_inference` instead. Whether to use HPU graphs for performing inference. - use_hpu_graphs_for_inference (`bool`, *optional*, defaults to `False`): - Whether to use HPU graphs for performing inference. It will reduce latency but may not be compatible with some operations. - use_hpu_graphs_for_training (`bool`, *optional*, defaults to `False`): - Whether to use HPU graphs for performing training. It will speed up training but may not be compatible with some operations. - disable_tensor_cache_hpu_graphs (`bool`, *optional*, defaults to `False`): - Whether to disable tensor cache when using hpu graphs. If True, tensors won't be cached in hpu graph and memory can be saved. - max_hpu_graphs (`int`, *optional*): - Maximum number of hpu graphs to be cached. Reduce to save device memory. - distribution_strategy (`str`, *optional*, defaults to `ddp`): - Determines how data parallel distributed training is achieved. May be: `ddp` or `fast_ddp`. - throughput_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for throughput calculation. For example, with `throughput_warmup_steps=N`, - the first N steps will not be considered in the calculation of the throughput.
This is especially - useful in lazy mode where the first two or three iterations typically take longer. - adjust_throughput (`bool`, *optional*, defaults to `False`): - Whether to remove the time taken for logging, evaluating and saving from throughput calculation. - pipelining_fwd_bwd (`bool`, *optional*, defaults to `False`): - Whether to add an additional `mark_step` between forward and backward for pipelining - host backward building and HPU forward computing. - non_blocking_data_copy (`bool`, *optional*, defaults to `False`): - Whether to enable async data copy when preparing inputs. - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - """ - - use_habana: Optional[bool] = field( - default=False, - metadata={"help": "Whether to use Habana's HPU for running the model."}, - ) - - gaudi_config_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained Gaudi config name or path."}, - ) - - use_lazy_mode: Optional[bool] = field( - default=True, - metadata={"help": "Whether to use lazy mode for running the model."}, - ) - - use_hpu_graphs: Optional[bool] = field( - default=False, - metadata={ - "help": "Deprecated, use `use_hpu_graphs_for_inference` instead. Whether to use HPU graphs for performing inference." - }, - ) - - use_hpu_graphs_for_inference: Optional[bool] = field( - default=False, - metadata={ - "help": "Whether to use HPU graphs for performing inference. It will reduce latency but may not be compatible with some operations." - }, - ) - - use_hpu_graphs_for_training: Optional[bool] = field( - default=False, - metadata={ - "help": "Whether to use HPU graphs for performing training. It will speed up training but may not be compatible with some operations." - }, - ) - - disable_tensor_cache_hpu_graphs: Optional[bool] = field( - default=False, - metadata={"help": "Whether to disable the tensor cache when using HPU graphs."}, - ) - - max_hpu_graphs: Optional[int] = field( - default=None, - metadata={"help": "Maximum number of HPU graphs to use."}, - ) - - distribution_strategy: Optional[str] = field( - default="ddp", - metadata={ - "help": "Determines how distributed data parallel training is achieved. " - "Can be either `ddp` (i.e. using `DistributedDataParallel`) or " - "`fast_ddp` (i.e. using `optimum.habana.distributed.all_reduce_gradients`).", - "choices": ["ddp", "fast_ddp"], - }, - ) - - throughput_warmup_steps: Optional[int] = field( - default=0, - metadata={ - "help": ( - "Number of steps to ignore for throughput calculation. For example, with `throughput_warmup_steps=N`," - " the first N steps will not be considered in the calculation of the throughput. This is especially" - " useful in lazy mode where the first two or three iterations typically take longer." - ) - }, - ) - - adjust_throughput: bool = field( - default=False, - metadata={ - "help": "Whether to remove the time taken for logging, evaluating and saving from throughput calculation." - }, - ) - - pipelining_fwd_bwd: Optional[bool] = field( - default=False, - metadata={ - "help": ( - "Whether to add an additional `mark_step` between forward and backward for pipelining " - "host backward building and HPU forward computing."
-
-    adjust_throughput: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to remove the time taken for logging, evaluating and saving from throughput calculation."
-        },
-    )
-
-    pipelining_fwd_bwd: Optional[bool] = field(
-        default=False,
-        metadata={
-            "help": (
-                "Whether to add an additional `mark_step` between forward and backward for pipelining "
-                "host backward building and HPU forward computing."
-            )
-        },
-    )
-
-    ignore_eos: Optional[bool] = field(
-        default=True,
-        metadata={"help": ("Whether to disable stopping with eos token when calling `generate`.")},
-    )
-
-    non_blocking_data_copy: Optional[bool] = field(
-        default=False,
-        metadata={"help": ("Whether to enable async data copy when preparing inputs.")},
-    )
-
-    profiling_warmup_steps: Optional[int] = field(
-        default=0,
-        metadata={"help": ("Number of steps to ignore for profiling.")},
-    )
-
-    profiling_steps: Optional[int] = field(
-        default=0,
-        metadata={"help": ("Number of steps to be captured when enabling profiling.")},
-    )
-
-    profiling_record_shapes: Optional[bool] = field(
-        default=True,
-        metadata={"help": ("Record shapes when enabling profiling.")},
-    )
-    # Overriding the default value of optim because 'adamw_hf' is deprecated
-    optim: Optional[Union[OptimizerNames, str]] = field(
-        default="adamw_torch",
-        metadata={"help": "The optimizer to use."},
-    )
-
-    # Overriding the default value of epsilon to be consistent with Habana FusedAdamW
-    adam_epsilon: Optional[float] = field(
-        default=1e-6,
-        metadata={"help": "Epsilon for AdamW optimizer."},
-    )
-
-    # Overriding logging_nan_inf_filter to make False the default value
-    logging_nan_inf_filter: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Filter nan and inf losses for logging."},
-    )
-
-    # Overriding ddp_bucket_cap_mb to make 230 the default value
-    ddp_bucket_cap_mb: Optional[int] = field(
-        default=230,
-        metadata={
-            "help": (
-                "When using distributed training, the value of the flag `bucket_cap_mb` passed to "
-                "`DistributedDataParallel`."
-            )
-        },
-    )
-
-    # Overriding ddp_find_unused_parameters to make False the default value
-    ddp_find_unused_parameters: Optional[bool] = field(
-        default=False,
-        metadata={
-            "help": (
-                "When using distributed training, the value of the flag `find_unused_parameters` passed to "
-                "`DistributedDataParallel`."
-            )
-        },
-    )
-
-    # Overriding half_precision_backend to allow only CPU and HPU as possible mixed-precision backends for Torch Autocast.
-    half_precision_backend: str = field(
-        default="hpu_amp",
-        metadata={
-            "help": "The backend to be used for half precision.",
-            "choices": ["cpu_amp", "hpu_amp"],
-        },
-    )
-
-    # Overriding ddp_backend to replace all possible backends by hccl
-    ddp_backend: Optional[str] = field(
-        default="hccl",
-        metadata={
-            "help": "The backend to be used for distributed training.",
-            "choices": ["hccl"],
-        },
-    )
-
-    # Use this to override default attn_implementation in transformers
-    attn_implementation: Optional[str] = field(
-        default="eager",
-        metadata={
-            "help": "Whether to use scaled dot product attention (SDPA) or not.",
-            "choices": ["eager", "sdpa"],
-        },
-    )
-
-    fp8: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Whether to use fp8 for training."},
-    )
-
-    fp8_recipe_format: Optional[str] = field(
-        default="E5M2",
-        metadata={
-            "help": "Which fp8 format to use for fp8 training.",
-            "choices": ["E5M2", "E4M3", "HYBRID"],
-        },
-    )
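Putting the fields above together, a typical instantiation might look as follows. This is a sketch only, assuming optimum-habana is installed on a Gaudi machine and that it exposes `GaudiTrainingArguments` at the package root as the library's examples do; `Habana/gpt2` is just an example Gaudi config name.

from optimum.habana import GaudiTrainingArguments

training_args = GaudiTrainingArguments(
    output_dir="./out",
    use_habana=True,                    # required for every HPU-specific option
    use_lazy_mode=True,                 # HPU graphs below need lazy mode (or torch.compile)
    gaudi_config_name="Habana/gpt2",    # example pretrained Gaudi config
    use_hpu_graphs_for_training=True,
    throughput_warmup_steps=3,
    distribution_strategy="fast_ddp",
)

The `__post_init__` that follows enforces exactly these dependencies: for instance, requesting lazy mode or HPU graphs without `use_habana` raises a `ValueError`.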
-
-    def __post_init__(self):
-        if self.use_hpu_graphs:
-            warnings.warn(
-                (
-                    "`--use_hpu_graphs` is deprecated and will be removed in a future version of 🤗 Optimum Habana. Use `--use_hpu_graphs_for_training` or `--use_hpu_graphs_for_inference` instead."
-                ),
-                FutureWarning,
-            )
-
-        use_hpu_graphs = self.use_hpu_graphs or self.use_hpu_graphs_for_inference or self.use_hpu_graphs_for_training
-
-        if (self.use_lazy_mode or use_hpu_graphs or self.gaudi_config_name) and not self.use_habana:
-            raise ValueError(
-                "`--use_lazy_mode`, `--use_hpu_graphs_for_inference`, `--use_hpu_graphs_for_training` and `--gaudi_config_name` cannot be used without `--use_habana`."
-            )
-        if use_hpu_graphs and (not self.use_lazy_mode and not self.torch_compile_backend):
-            raise ValueError(
-                "`--use_hpu_graphs_for_inference` and `--use_hpu_graphs_for_training` cannot be used in eager mode. Please set `--use_lazy_mode` to True."
-            )
-
-        if self.distribution_strategy not in SUPPORTED_DISTRIBUTION_STRATEGIES:
-            raise ValueError(
-                f"`--distribution_strategy` is {self.distribution_strategy} which is an invalid or unsupported value. Possible choices are: {', '.join(SUPPORTED_DISTRIBUTION_STRATEGIES)}."
-            )
-
-        if self.disable_tensor_cache_hpu_graphs and not use_hpu_graphs:
-            raise ValueError("`--disable_tensor_cache_hpu_graphs` requires HPU graphs to be enabled.")
-
-        if self.max_hpu_graphs is not None and not use_hpu_graphs:
-            raise ValueError("`--max_hpu_graphs` requires HPU graphs to be enabled.")
-
-        # Raise errors for arguments that are not supported by optimum-habana
-        if self.fp16 or self.fp16_full_eval:
-            raise ValueError(
-                "--fp16, --fp16_backend, --fp16_full_eval and --fp16_opt_level are not"
-                " supported by optimum-habana. Mixed-precision can be enabled in your Gaudi configuration."
-            )
-        if self.tpu_num_cores or self.tpu_metrics_debug:
-            raise ValueError("TPUs are not supported by optimum-habana.")
-        if self.mp_parameters:
-            raise ValueError("--mp_parameters is not supported by optimum-habana.")
-        if self.tf32:
-            raise ValueError("--tf32 is not supported by optimum-habana.")
-
-        if self.throughput_warmup_steps < 0:
-            raise ValueError("--throughput_warmup_steps must be non-negative.")
-
-        # Parse in args that could be `dict` sent in from the CLI as a string
-        for field in _VALID_DICT_FIELDS:
-            passed_value = getattr(self, field)
-            # We only want to do this if the str starts with a bracket to indicate a `dict`,
-            # else it's likely a filename if supported
-            if isinstance(passed_value, str) and passed_value.startswith("{"):
-                loaded_dict = json.loads(passed_value)
-                # Convert str values to types if applicable
-                loaded_dict = _convert_str_dict(loaded_dict)
-                setattr(self, field, loaded_dict)
-
-        # Expand paths; otherwise os.makedirs("~/bar") will make a directory
-        # in the current directory instead of the actual home
-        # see https://github.com/huggingface/transformers/issues/10628
-        if self.output_dir is not None:
-            self.output_dir = os.path.expanduser(self.output_dir)
-        if self.logging_dir is None and self.output_dir is not None:
-            self.logging_dir = os.path.join(self.output_dir, default_logdir())
-        if self.logging_dir is not None:
-            self.logging_dir = os.path.expanduser(self.logging_dir)
-
-        if self.disable_tqdm is None:
-            self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
-
-        if isinstance(self.evaluation_strategy, EvaluationStrategy):
-            warnings.warn(
-                (
-                    "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version"
-                    " 5 of 🤗 Transformers. Use `IntervalStrategy` instead"
-                ),
-                FutureWarning,
-            )
-            # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it.
- self.evaluation_strategy = self.evaluation_strategy.value - - self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) - self.logging_strategy = IntervalStrategy(self.logging_strategy) - self.save_strategy = IntervalStrategy(self.save_strategy) - self.hub_strategy = HubStrategy(self.hub_strategy) - - self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) - if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: - self.do_eval = True - - # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero - if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): - if self.logging_steps > 0: - logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}") - self.eval_steps = self.logging_steps - else: - raise ValueError( - f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or" - " --logging_steps" - ) - - # logging_steps must be non-zero for logging_strategy that is other than 'no' - if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: - raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") - - if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1: - if self.logging_steps != int(self.logging_steps): - raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") - self.logging_steps = int(self.logging_steps) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: - if self.eval_steps != int(self.eval_steps): - raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") - self.eval_steps = int(self.eval_steps) - if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: - if self.save_steps != int(self.save_steps): - raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") - self.save_steps = int(self.save_steps) - - # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. - if self.load_best_model_at_end: - if self.evaluation_strategy != self.save_strategy: - raise ValueError( - "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " - f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" - ) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: - if self.eval_steps < 1 or self.save_steps < 1: - if not (self.eval_steps < 1 and self.save_steps < 1): - raise ValueError( - "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " - "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps " - f"{self.save_steps} and eval_steps {self.eval_steps}." - ) - # Work around floating point precision issues - LARGE_MULTIPLIER = 1_000_000 - if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: - raise ValueError( - "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " - f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." - ) - raise ValueError( - "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " - f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." 
- ) - - safetensors_available = is_safetensors_available() - if self.save_safetensors and not safetensors_available: - raise ValueError(f"--save_safetensors={self.save_safetensors} requires safetensors to be installed!") - if not self.save_safetensors and safetensors_available: - logger.info( - f"Found safetensors installation, but --save_safetensors={self.save_safetensors}. " - f"Safetensors should be a preferred weights saving format due to security and performance reasons. " - f"If your model cannot be saved by safetensors please feel free to open an issue at " - f"https://github.com/huggingface/safetensors!" - ) - - if ( - self.load_best_model_at_end or self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU - ) and self.metric_for_best_model is None: - self.metric_for_best_model = "loss" - if self.greater_is_better is None and self.metric_for_best_model is not None: - self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] - if self.run_name is None: - self.run_name = self.output_dir - - if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU: - if self.evaluation_strategy == IntervalStrategy.NO: - raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy") - if not is_torch_available(): - raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0") - - self.optim = OptimizerNames(self.optim) - if self.adafactor: - warnings.warn( - ( - "`--adafactor` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--optim" - " adafactor` instead" - ), - FutureWarning, - ) - self.optim = OptimizerNames.ADAFACTOR - if self.optim == OptimizerNames.ADAMW_TORCH_FUSED and is_torch_available(): - if version.parse(version.parse(torch.__version__).base_version) < version.parse("2.0.0"): - raise ValueError("--optim adamw_torch_fused requires PyTorch 2.0 or higher") - - if (self.torch_compile_mode is not None or self.torch_compile_backend is not None) and not self.torch_compile: - assert get_habana_frameworks_version().minor > 12, "Torch compile is not available" - self.torch_compile = True - assert not os.getenv("PT_HPU_LAZY_MODE", "1") != "0", "Dynamo and lazy are mutually exclusive." - # Note: PT_HPU_LAZY_MODE=0 needs to be set before library is loaded, - # setting it here would be too late - hence assertion. - if self.torch_compile and self.torch_compile_backend is None: - self.torch_compile_backend = "inductor" - - # accelerate integration for torch compile - if self.torch_compile: - # set env vars for accelerate - prefix = "ACCELERATE_DYNAMO_" - os.environ[prefix + "BACKEND"] = self.torch_compile_backend - if self.torch_compile_mode is not None: - os.environ[prefix + "MODE"] = self.torch_compile_mode - - # if training args is specified, it will override the one specified in the accelerate config - mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") - if self.fp8: - mixed_precision_dtype = "fp8" - elif self.bf16: - mixed_precision_dtype = "bf16" - os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype - - if self.report_to is None: - logger.info( - "The default value for the training argument `--report_to` will change in v5 (from all installed " - "integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as " - "now. You should start updating your code and make this info disappear :-)." - ) - self.report_to = "all" - if self.report_to == "all" or self.report_to == ["all"]: - # Import at runtime to avoid a circular import. 
- from transformers.integrations import get_available_reporting_integrations - - self.report_to = get_available_reporting_integrations() - elif self.report_to == "none" or self.report_to == ["none"]: - self.report_to = [] - elif not isinstance(self.report_to, list): - self.report_to = [self.report_to] - - if self.warmup_ratio < 0 or self.warmup_ratio > 1: - raise ValueError("warmup_ratio must lie in range [0,1]") - elif self.warmup_ratio > 0 and self.warmup_steps > 0: - logger.info( - "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio" - " during training" - ) - - # Copy of https://github.com/huggingface/transformers/blob/b71f20a7c9f3716d30f6738501559acf863e2c5c/src/transformers/training_args.py#L1563 - # except following changes, (1) Remove XLA specific code & (2) change fsdp_backward_prefetch to backward_prefetch - if isinstance(self.fsdp, bool): - self.fsdp = "full_shard" if self.fsdp else "" - if isinstance(self.fsdp, str): - self.fsdp = [FSDPOption(s) for s in self.fsdp.split()] - if self.fsdp == [FSDPOption.OFFLOAD]: - raise ValueError( - "`--fsdp offload` can't work on its own. It needs to be added to `--fsdp full_shard` or " - '`--fsdp shard_grad_op`. For example, `--fsdp "full_shard offload"`.' - ) - elif FSDPOption.FULL_SHARD in self.fsdp and FSDPOption.SHARD_GRAD_OP in self.fsdp: - raise ValueError("`--fsdp full_shard` is not compatible with `--fsdp shard_grad_op`.") - - if self.fsdp_config is None: - self.fsdp_config = {} - - if isinstance(self.fsdp_config, str): - if len(self.fsdp) == 0: - warnings.warn("`--fsdp_config` is useful only when `--fsdp` is specified.") - with io.open(self.fsdp_config, "r", encoding="utf-8") as f: - self.fsdp_config = json.load(f) - for k in list(self.fsdp_config.keys()): - if k.startswith("fsdp_"): - v = self.fsdp_config.pop(k) - self.fsdp_config[k[5:]] = v - - if self.fsdp_min_num_params > 0: - warnings.warn("using `--fsdp_min_num_params` is deprecated. Use fsdp_config instead ", FutureWarning) - - self.fsdp_config["min_num_params"] = max(self.fsdp_config.get("min_num_params", 0), self.fsdp_min_num_params) - - # if fsdp_config["transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object - if isinstance(self.fsdp_config.get("transformer_layer_cls_to_wrap", None), str): - self.fsdp_config["transformer_layer_cls_to_wrap"] = [self.fsdp_config["transformer_layer_cls_to_wrap"]] - - if self.fsdp_transformer_layer_cls_to_wrap is not None: - warnings.warn( - "using `--fsdp_transformer_layer_cls_to_wrap` is deprecated. 
Use fsdp_config instead ", FutureWarning - ) - self.fsdp_config["transformer_layer_cls_to_wrap"] = self.fsdp_config.get( - "transformer_layer_cls_to_wrap", [] - ) + [self.fsdp_transformer_layer_cls_to_wrap] - - if len(self.fsdp) == 0 and self.fsdp_config["min_num_params"] > 0: - warnings.warn("`min_num_params` is useful only when `--fsdp` is specified.") - - if len(self.fsdp) == 0 and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: - warnings.warn("`transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") - - if ( - len(self.fsdp) > 0 - and self.fsdp_config["min_num_params"] > 0 - and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None - ): - raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") - self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) - self.fsdp_config["xla_fsdp_v2"] = self.fsdp_config.get("xla_fsdp_v2", False) - self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) - - # accelerate integration for FSDP - if len(self.fsdp) > 0 and not self.fsdp_config["xla"]: - os.environ["ACCELERATE_USE_FSDP"] = "true" - os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true" - from accelerate.utils.constants import ( - FSDP_AUTO_WRAP_POLICY, - FSDP_SHARDING_STRATEGY, - ) - - prefix = "FSDP_" - for fsdp_option in self.fsdp: - if fsdp_option.upper() in FSDP_SHARDING_STRATEGY: - # set environment variable for FSDP sharding strategy - os.environ[f"{prefix}SHARDING_STRATEGY"] = str( - FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1 - ) - elif fsdp_option == FSDPOption.OFFLOAD: - os.environ[f"{prefix}OFFLOAD_PARAMS"] = "true" - elif fsdp_option == FSDPOption.AUTO_WRAP: - os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] - if self.fsdp_config["min_num_params"] > 0: - os.environ[f"{prefix}MIN_NUM_PARAMS"] = str(self.fsdp_config["min_num_params"]) - os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] - elif self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: - os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"] = ",".join( - self.fsdp_config["transformer_layer_cls_to_wrap"] - ) - prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH") - os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() - os.environ[f"{prefix}FORWARD_PREFETCH"] = str(self.fsdp_config.get("forward_prefetch", "false")) - os.environ[f"{prefix}SYNC_MODULE_STATES"] = str(self.fsdp_config.get("sync_module_states", "true")) - os.environ[f"{prefix}USE_ORIG_PARAMS"] = str(self.fsdp_config.get("use_orig_params", "true")) - os.environ[f"{prefix}ACTIVATION_CHECKPOINTING"] = str( - self.fsdp_config.get("activation_checkpointing", "false") - ) - - if is_accelerate_available(): - if not isinstance(self.accelerator_config, (AcceleratorConfig)): - if self.accelerator_config is None: - self.accelerator_config = AcceleratorConfig() - elif isinstance(self.accelerator_config, dict): - self.accelerator_config = AcceleratorConfig(**self.accelerator_config) - # Check that a user didn't pass in the class instantiator - # such as `accelerator_config = AcceleratorConfig` - elif isinstance(self.accelerator_config, type): - raise NotImplementedError( - "Tried passing in a callable to `accelerator_config`, but this is not supported. " - "Please pass in a fully constructed `AcceleratorConfig` object instead." 
- ) - else: - self.accelerator_config = AcceleratorConfig.from_json_file(self.accelerator_config) - if self.dispatch_batches is not None: - warnings.warn( - "Using `--dispatch_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" - " `--accelerator_config {'dispatch_batches':VALUE} instead", - FutureWarning, - ) - self.accelerator_config.dispatch_batches = self.dispatch_batches - - if self.split_batches is not None: - warnings.warn( - "Using `--split_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" - " `--accelerator_config {'split_batches':VALUE} instead", - FutureWarning, - ) - self.accelerator_config.split_batches = self.split_batches - - if self.dataloader_drop_last: - self.accelerator_config.even_batches = False - - if isinstance(self.debug, str): - self.debug = [DebugOption(s) for s in self.debug.split()] - elif self.debug is None: - self.debug = [] - - # This call to self.device is necessary to call _setup_devices so that - # torch.distributed is initialized - device_is_hpu = self.device.type == "hpu" - self.deepspeed_plugin = None - if self.deepspeed: - if not device_is_hpu: - raise ValueError("This version of DeepSpeed must be run on HPUs.") - - # - must be run very last in arg parsing, since it will use a lot of these settings. - # - must be run before the model is created. - if not is_accelerate_available(): - raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.") - from .integrations.deepspeed import GaudiTrainerDeepSpeedConfig - - # will be used later by the Trainer - # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) - self.hf_deepspeed_config = GaudiTrainerDeepSpeedConfig(self.deepspeed) - self.hf_deepspeed_config.trainer_config_process(self) - - # Accelerate DeepSpeed Plugin - from accelerate.utils import DeepSpeedPlugin - - os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" - self.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.hf_deepspeed_config) - elif strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")): - # Accelerate DeepSpeed Plugin - from accelerate.utils import DeepSpeedPlugin - - self.deepspeed_plugin = DeepSpeedPlugin() - mixed_precision = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") - self.deepspeed_plugin.set_mixed_precision(mixed_precision) - self.deepspeed_plugin.set_deepspeed_weakref() - - if self.use_cpu: - self.dataloader_pin_memory = False - - if self.dataloader_num_workers == 0 and self.dataloader_prefetch_factor is not None: - raise ValueError( - "--dataloader_prefetch_factor can only be set when data is loaded in a different process, i.e." - " when --dataloader_num_workers > 1." - ) - - if self.push_to_hub_token is not None: - warnings.warn( - ( - "`--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use " - "`--hub_token` instead." - ), - FutureWarning, - ) - self.hub_token = self.push_to_hub_token - - if self.push_to_hub_model_id is not None: - self.hub_model_id = get_full_repo_name( - self.push_to_hub_model_id, organization=self.push_to_hub_organization, token=self.hub_token - ) - if self.push_to_hub_organization is not None: - warnings.warn( - ( - "`--push_to_hub_model_id` and `--push_to_hub_organization` are deprecated and will be removed" - " in version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to" - f" this argument (in this case {self.hub_model_id})." 
- ), - FutureWarning, - ) - else: - warnings.warn( - ( - "`--push_to_hub_model_id` is deprecated and will be removed in version 5 of 🤗 Transformers." - " Use `--hub_model_id` instead and pass the full repo name to this argument (in this case" - f" {self.hub_model_id})." - ), - FutureWarning, - ) - elif self.push_to_hub_organization is not None: - self.hub_model_id = f"{self.push_to_hub_organization}/{Path(self.output_dir).name}" - warnings.warn( - ( - "`--push_to_hub_organization` is deprecated and will be removed in version 5 of 🤗 Transformers." - " Use `--hub_model_id` instead and pass the full repo name to this argument (in this case" - f" {self.hub_model_id})." - ), - FutureWarning, - ) - - def __str__(self): - self_as_dict = asdict(self) - - # Remove deprecated arguments. That code should be removed once - # those deprecated arguments are removed from TrainingArguments. (TODO: transformers v5) - del self_as_dict["per_gpu_train_batch_size"] - del self_as_dict["per_gpu_eval_batch_size"] - # Remove arguments that are unsupported by optimum-habana - for key in UNSUPPORTED_ARGUMENTS: - del self_as_dict[key] - - self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} - - attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] - return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" - - __repr__ = __str__ - - @cached_property - def _setup_devices(self) -> "torch.device": - requires_backends(self, ["torch"]) - - # Hack to make sure bf16/fp32 ops are specified before calling habana_frameworks.torch.core - if self.gaudi_config_name is not None: - gaudi_config = GaudiConfig.from_pretrained(self.gaudi_config_name) - if ( - (self.bf16 or gaudi_config.use_torch_autocast) - and not self.deepspeed - and self.half_precision_backend == "hpu_amp" - ): - gaudi_config.declare_autocast_bf16_fp32_ops() - - logger.info("PyTorch: setting up devices") - if not is_accelerate_available(): - raise ImportError( - f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: " - "Please run `pip install transformers[torch]` or `pip install accelerate -U`" - ) - GaudiAcceleratorState._reset_state() - GaudiPartialState._reset_state() - self.distributed_state = None - - # Set the log level here for optimum.utils.logging - # otherwise logs are not sent in this method. - log_level = self.get_process_log_level() - logging.set_verbosity(log_level) - - if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ: - os.environ["ACCELERATE_USE_IPEX"] = "false" - if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")): - self.distributed_state = GaudiPartialState(cpu=True, backend=self.ddp_backend) - self._n_gpu = 0 - elif self.use_habana: - # Some methods needs to be tweaked to optimally run on Gaudi - # Calling this method here to be sure it is done before model instantiation - # Otherwise this will fail when some __init__ methods are overridden (cf. 
GPT2Attention) - from .modeling_utils import adapt_transformers_to_gaudi - - adapt_transformers_to_gaudi() - - if self.use_lazy_mode: - logger.info("Enabled lazy mode.") - elif not self.torch_compile: - if os.getenv("PT_HPU_LAZY_MODE", "1") != "0": - raise ValueError( - "Lazy mode or compile mode not enabled => eager mode should be enabled using PT_HPU_LAZY_MODE=0" - ) - - if self.deepspeed: - # Need to do similar for Accelerator init - os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" - self.distributed_state = GaudiPartialState(timeout=timedelta(seconds=self.ddp_timeout)) - del os.environ["ACCELERATE_USE_DEEPSPEED"] - else: - self.distributed_state = GaudiPartialState( - backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout) - ) - self._n_gpu = 1 - else: - raise ValueError( - "No device has been set. Use either --use_habana to run on HPU or --no_cuda to run on CPU." - ) - - device = self.distributed_state.device - self.local_rank = self.distributed_state.local_process_index - if ( - torch.distributed.is_available() - and torch.distributed.is_initialized() - and self.parallel_mode != ParallelMode.DISTRIBUTED - ): - logger.warning( - "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. " - "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch" - ) - - if self.distributed_state.distributed_type == GaudiDistributedType.NO: - self._n_gpu = 0 - - return device diff --git a/optimum/habana/transformers/training_args_seq2seq.py b/optimum/habana/transformers/training_args_seq2seq.py deleted file mode 100644 index 82e02bb491..0000000000 --- a/optimum/habana/transformers/training_args_seq2seq.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from pathlib import Path -from typing import Optional, Union - -from optimum.utils import logging - -from .generation import GaudiGenerationConfig -from .training_args import GaudiTrainingArguments - - -logger = logging.get_logger(__name__) - - -@dataclass -class GaudiSeq2SeqTrainingArguments(GaudiTrainingArguments): - """ - GaudiSeq2SeqTrainingArguments is built on top of the Tranformers' [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments) - to enable deployment on Habana's Gaudi. - - Args: - sortish_sampler (`bool`, *optional*, defaults to `False`): - Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset* - for now but will become generally available in the near future. - It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness - for the training set. - predict_with_generate (`bool`, *optional*, defaults to `False`): - Whether to use generate to calculate generative metrics (ROUGE, BLEU). 
- generation_max_length (`int`, *optional*): - The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the - `max_length` value of the model configuration. - generation_num_beams (`int`, *optional*): - The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the - `num_beams` value of the model configuration. - generation_config (`str` or `Path` or [`transformers.generation.GenerationConfig`], *optional*): - Allows to load a [`transformers.generation.GenerationConfig`] from the `from_pretrained` method. This can be either: - - - a string, the *model id* of a pretrained model configuration hosted inside a model repo on - huggingface.co. - - a path to a *directory* containing a configuration file saved using the - [`transformers.GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`. - - a [`transformers.generation.GenerationConfig`] object. - """ - - sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) - predict_with_generate: bool = field( - default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} - ) - generation_max_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default " - "to the `max_length` value of the model configuration." - ) - }, - ) - generation_num_beams: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default " - "to the `num_beams` value of the model configuration." - ) - }, - ) - generation_config: Optional[Union[str, Path, GaudiGenerationConfig]] = field( - default=None, - metadata={ - "help": "Model id, file path or url pointing to a GenerationConfig json file, to use during prediction." - }, - ) - - def to_dict(self): - """ - Serializes this instance while replace `Enum` by their values and `GaudiGenerationConfig` by dictionaries (for JSON - serialization support). It obfuscates the token values by removing their value. - """ - # filter out fields that are defined as field(init=False) - d = super().to_dict() - for k, v in d.items(): - if isinstance(v, GaudiGenerationConfig): - d[k] = v.to_dict() - return d diff --git a/optimum/habana/trl/__init__.py b/optimum/habana/trl/__init__.py deleted file mode 100644 index c35efc5a61..0000000000 --- a/optimum/habana/trl/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .models.modeling_base import adapt_PreTrainedModelWrapper_to_gaudi -from .models.modeling_sd_base import GaudiDefaultDDPOStableDiffusionPipeline -from .trainer.ddpo_trainer import GaudiDDPOTrainer -from .trainer.dpo_trainer import GaudiDPOTrainer -from .trainer.ppo_config import GaudiPPOConfig -from .trainer.ppo_trainer import GaudiPPOTrainer -from .trainer.reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding -from .trainer.sft_trainer import GaudiSFTTrainer diff --git a/optimum/habana/trl/models/__init__.py b/optimum/habana/trl/models/__init__.py deleted file mode 100644 index 5e0e6ee610..0000000000 --- a/optimum/habana/trl/models/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from trl.import_utils import is_diffusers_available - -from .modeling_base import adapt_PreTrainedModelWrapper_to_gaudi - - -if is_diffusers_available(): - from .modeling_sd_base import ( - GaudiDefaultDDPOStableDiffusionPipeline, - ) diff --git a/optimum/habana/trl/models/modeling_base.py b/optimum/habana/trl/models/modeling_base.py deleted file mode 100644 index fcdc7ddc3f..0000000000 --- a/optimum/habana/trl/models/modeling_base.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import torch -from trl import PreTrainedModelWrapper - -from optimum.habana.utils import to_device_dtype - - -def adapt_PreTrainedModelWrapper_to_gaudi(): - PreTrainedModelWrapper._get_current_device = gaudi_get_current_device - PreTrainedModelWrapper.save_pretrained = gaudi_save_pretrained - - -def gaudi_get_current_device(): - """ - Copied from PreTrainedModelWrapper._get_current_device: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L392 - - add hpu device - """ - if hasattr(torch, "hpu") and torch.hpu.is_available(): - return "hpu" - else: - return "cpu" - - -def gaudi_save_pretrained(self, *args, **kwargs): - """ - Copied from PreTrainedModelWrapper.save_pretrained: https://github.com/huggingface/trl/blob/v0.7.6/trl/models/modeling_base.py#L528 - - to cpu if model dict is in hpu - """ - state_dict = kwargs.get("state_dict") - if state_dict is None: - state_dict = self.state_dict() - kwargs["state_dict"] = state_dict - - if self.__class__._get_current_device() == "hpu": - state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu")) - - # if it is a peft model only save the `v_head` state_dict and - # pop the `state_dict` from the kwargs to avoid slient bugs with `peft` - if self.is_peft_model: - save_path = args[0] - save_path = os.path.join(save_path, "pytorch_model.bin") - torch.save(state_dict, save_path) - _ = kwargs.pop("state_dict", None) - - if self.__class__._get_current_device() == "hpu": - state_dict = self.pretrained_model.state_dict() - state_dict = to_device_dtype(state_dict, target_device=torch.device("cpu")) - kwargs["state_dict"] = state_dict - - return self.pretrained_model.save_pretrained(*args, **kwargs) diff --git a/optimum/habana/trl/models/modeling_sd_base.py b/optimum/habana/trl/models/modeling_sd_base.py deleted file mode 100644 index 006f3d77c8..0000000000 --- a/optimum/habana/trl/models/modeling_sd_base.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright 2023 DDPO-pytorch authors (Kevin Black), The HuggingFace Team, metric-space. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg -from trl.models import DDPOPipelineOutput, DDPOSchedulerOutput, DefaultDDPOStableDiffusionPipeline -from trl.models.modeling_sd_base import ( - _get_variance, - _left_broadcast, -) - -from optimum.habana import GaudiConfig -from optimum.habana.diffusers import ( - GaudiDDIMScheduler, - GaudiStableDiffusionPipeline, -) - - -def scheduler_step( - self, - model_output: torch.FloatTensor, - timestep: int, - sample: torch.FloatTensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - prev_sample: Optional[torch.FloatTensor] = None, -) -> DDPOSchedulerOutput: - """ - Adapted from: https://github.com/huggingface/trl/blob/v0.7.8/trl/models/modeling_sd_base.py#L187 - - Changed random number generation for HPU - """ - - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. get previous step value (=t-1) - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - # to prevent OOB on gather - prev_timestep = torch.clamp(prev_timestep, 0, self.config.num_train_timesteps - 1) - - # 2. compute alphas, betas - alpha_prod_t = self.alphas_cumprod.gather(0, timestep.cpu()) - alpha_prod_t_prev = torch.where( - prev_timestep.cpu() >= 0, - self.alphas_cumprod.gather(0, prev_timestep.cpu()), - self.final_alpha_cumprod, - ) - alpha_prod_t = _left_broadcast(alpha_prod_t, sample.shape).to(sample.device) - alpha_prod_t_prev = _left_broadcast(alpha_prod_t_prev, sample.shape).to(sample.device) - - beta_prod_t = 1 - alpha_prod_t - - # 3. 
compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - # 4. Clip or threshold "predicted x_0" - if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) - elif self.config.clip_sample: - pred_original_sample = pred_original_sample.clamp( - -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 5. compute variance: "sigma_t(η)" -> see formula (16) - # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = _get_variance(self, timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - std_dev_t = _left_broadcast(std_dev_t, sample.shape).to(sample.device) - - if use_clipped_model_output: - # the pred_epsilon is always re-derived from the clipped x_0 in Glide - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - - # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon - - # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample_mean = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - - if prev_sample is not None and generator is not None: - raise ValueError( - "Cannot pass both generator and prev_sample. Please make sure that either `generator` or" - " `prev_sample` stays `None`." - ) - - if prev_sample is None: - # torch.randn is broken on HPU so running it on CPU - rand_device = "cpu" if model_output.type == "hpu" else model_output.device - variance_noise = torch.randn( - model_output.shape, - generator=generator, - device=rand_device, - dtype=model_output.dtype, - ).to(model_output.device) - prev_sample = prev_sample_mean + std_dev_t * variance_noise - - # log prob of prev_sample given prev_sample_mean and std_dev_t - log_prob = ( - -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2)) - - torch.log(std_dev_t) - - torch.log(torch.sqrt(2 * torch.as_tensor(np.pi, device=model_output.device))) - ) - # mean along all but batch dimension - log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim))) - - return DDPOSchedulerOutput(prev_sample.type(sample.dtype), log_prob) - - -# 1. The output type for call is different as the logprobs are now returned -# 2. 
An extra method called `scheduler_step` is added which is used to constraint the scheduler output -@torch.no_grad() -def pipeline_step( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, -): - r""" - Adapted from: https://github.com/huggingface/trl/blob/v0.7.8/trl/models/modeling_sd_base.py#L325 - - Add `mark_step()` - - Added support for HPU graphs - - Reset time-dependent variables in Gaudi scheduler - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_encoder_lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - prompt_embeds = self._encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device="cpu") - timesteps = self.scheduler.timesteps.to(self.device) - self.scheduler.reset_timestep_dependent_params() - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - all_latents = [latents] - all_log_probs = [] - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i in range(num_inference_steps): - t = timesteps[0] - timesteps = torch.roll(timesteps, shifts=-1, dims=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet_hpu( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - timestep_cond=None, - added_cond_kwargs=None, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - if do_classifier_free_guidance and guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = scheduler_step(self.scheduler, noise_pred, t, latents, eta) - if self.use_habana and not self.use_hpu_graphs: - self.htcore.mark_step() - - latents = scheduler_output.latents - log_prob = scheduler_output.log_probs - - all_latents.append(latents) - all_log_probs.append(log_prob) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if self.use_habana and not self.use_hpu_graphs: - self.htcore.mark_step() - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - return DDPOPipelineOutput(image, all_latents, all_log_probs) - - -class GaudiDefaultDDPOStableDiffusionPipeline(DefaultDDPOStableDiffusionPipeline): - def __init__( - self, - pretrained_model_name: str, - *, - pretrained_model_revision: str = "main", - use_lora: bool = True, - use_habana: bool = False, - use_hpu_graphs: bool = False, - gaudi_config: Union[str, GaudiConfig] = None, - bf16_full_eval: bool = False, - ): - """ - Adapted from: https://github.com/huggingface/trl/blob/v0.7.8/trl/models/modeling_sd_base.py#L531 - - use GaudiStableDiffusionPipeline instead of StableDiffusionPipeline - - use GaudiDDIMScheduler instead of DDIMScheduler - - support bf16. 
- """ - self.sd_pipeline = GaudiStableDiffusionPipeline.from_pretrained( - pretrained_model_name, - revision=pretrained_model_revision, - use_habana=use_habana, - use_hpu_graphs=use_hpu_graphs, - gaudi_config=gaudi_config, - torch_dtype=torch.bfloat16 if bf16_full_eval else torch.float32, - ) - - self.use_lora = use_lora - self.pretrained_model = pretrained_model_name - self.pretrained_revision = pretrained_model_revision - self.use_habana = use_habana - self.use_hpu_graphs = use_hpu_graphs - self.gaudi_config = gaudi_config - - try: - self.sd_pipeline.load_lora_weights( - pretrained_model_name, - weight_name="pytorch_lora_weights.safetensors", - revision=pretrained_model_revision, - ) - self.use_lora = True - except OSError: - if use_lora: - warnings.warn( - "If you are aware that the pretrained model has no lora weights to it, ignore this message. " - "Otherwise please check the if `pytorch_lora_weights.safetensors` exists in the model folder." - ) - - self.sd_pipeline.scheduler = GaudiDDIMScheduler.from_config(self.sd_pipeline.scheduler.config) - self.sd_pipeline.safety_checker = None - - # memory optimization - self.sd_pipeline.vae.requires_grad_(False) - self.sd_pipeline.text_encoder.requires_grad_(False) - self.sd_pipeline.unet.requires_grad_(not self.use_lora) - - def __call__(self, *args, **kwargs) -> DDPOPipelineOutput: - return pipeline_step(self.sd_pipeline, *args, **kwargs) - - @property - def unet_hpu(self): - if self.use_habana: - return self.sd_pipeline.unet_hpu - else: - return self.sd_pipeline.unet diff --git a/optimum/habana/trl/trainer/__init__.py b/optimum/habana/trl/trainer/__init__.py deleted file mode 100644 index e8a164ee58..0000000000 --- a/optimum/habana/trl/trainer/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# flake8: noqa - -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# There is a circular import in the PPOTrainer if we let isort sort these -# isort: on - -from .sft_trainer import GaudiSFTTrainer -from .dpo_trainer import GaudiDPOTrainer -from .ppo_config import GaudiPPOConfig -from .ppo_trainer import GaudiPPOTrainer -from .reward_trainer import GaudiRewardTrainer, RewardDataCollatorWithPadding - -from .ddpo_trainer import GaudiDDPOTrainer diff --git a/optimum/habana/trl/trainer/ddpo_trainer.py b/optimum/habana/trl/trainer/ddpo_trainer.py deleted file mode 100644 index 73c6b725e1..0000000000 --- a/optimum/habana/trl/trainer/ddpo_trainer.py +++ /dev/null @@ -1,522 +0,0 @@ -# Copyright 2023 DDPO-pytorch authors (Kevin Black), metric-space, The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from collections import defaultdict -from concurrent import futures -from typing import Any, Callable, Optional, Tuple -from warnings import warn - -import torch -from accelerate.logging import get_logger -from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration -from tqdm.auto import tqdm -from trl import DDPOTrainer -from trl.models import DDPOStableDiffusionPipeline -from trl.trainer import DDPOConfig -from trl.trainer.utils import PerPromptStatTracker - -from optimum.habana import GaudiConfig -from optimum.habana.accelerate import GaudiAccelerator -from optimum.habana.utils import set_seed - - -logger = get_logger(__name__) - - -class GaudiDDPOTrainer(DDPOTrainer): - def __init__( - self, - config: DDPOConfig, - reward_function: Callable[[torch.Tensor, Tuple[str], Tuple[Any]], torch.Tensor], - prompt_function: Callable[[], Tuple[str, Any]], - sd_pipeline: DDPOStableDiffusionPipeline, - image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None, - gaudi_config: GaudiConfig = None, - use_habana: bool = True, - use_hpu_graphs: bool = False, - ): - """ - Adapted from DDPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L72 - The changes are: - - add new args gaudi_config - - support HPU graphs for trainable layers - - use GaudiAccelerator instead of Accelerator - - support FusedClipNorm - """ - if image_samples_hook is None: - warn("No image_samples_hook provided; no images will be logged") - - self.prompt_fn = prompt_function - self.reward_fn = reward_function - self.config = config - self.image_samples_callback = image_samples_hook - self.gaudi_config = gaudi_config - self.use_hpu_graphs = use_hpu_graphs - self.use_habana = use_habana - - if use_habana: - try: - import habana_frameworks.torch.core as htcore - except ImportError as error: - error.msg = f"Could not import habana_frameworks.torch.core. {error.msg}." 
- raise error - self.htcore = htcore - - accelerator_project_config = ProjectConfiguration(**self.config.project_kwargs) - - if self.config.resume_from: - self.config.resume_from = os.path.normpath(os.path.expanduser(self.config.resume_from)) - if "checkpoint_" not in os.path.basename(self.config.resume_from): - # get the most recent checkpoint in this directory - checkpoints = list( - filter( - lambda x: "checkpoint_" in x, - os.listdir(self.config.resume_from), - ) - ) - if len(checkpoints) == 0: - raise ValueError(f"No checkpoints found in {self.config.resume_from}") - checkpoint_numbers = sorted([int(x.split("_")[-1]) for x in checkpoints]) - self.config.resume_from = os.path.join( - self.config.resume_from, - f"checkpoint_{checkpoint_numbers[-1]}", - ) - - accelerator_project_config.iteration = checkpoint_numbers[-1] + 1 - - # number of timesteps within each trajectory to train on - self.num_train_timesteps = int(self.config.sample_num_steps * self.config.train_timestep_fraction) - kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) - self.accelerator = GaudiAccelerator( - log_with=self.config.log_with, - mixed_precision="bf16" if config.mixed_precision == "bf16" else "no", - project_config=accelerator_project_config, - cpu=(not use_habana), - kwargs_handlers=[kwargs], - # we always accumulate gradients across timesteps; we want config.train.gradient_accumulation_steps to be the - # number of *samples* we accumulate across, so we need to multiply by the number of training timesteps to get - # the total number of optimizer steps to accumulate across. - gradient_accumulation_steps=self.config.train_gradient_accumulation_steps * self.num_train_timesteps, - **self.config.accelerator_kwargs, - ) - - is_okay, message = self._config_check() - if not is_okay: - raise ValueError(message) - - is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" - - if self.accelerator.is_main_process: - self.accelerator.init_trackers( - self.config.tracker_project_name, - config={"ddpo_trainer_config": config.to_dict()} if not is_using_tensorboard else config.to_dict(), - init_kwargs=self.config.tracker_kwargs, - ) - - logger.info(f"\n{config}") - - set_seed(self.config.seed) - - self.sd_pipeline = sd_pipeline - - self.sd_pipeline.set_progress_bar_config( - position=1, - disable=not self.accelerator.is_local_main_process, - leave=False, - desc="Timestep", - dynamic_ncols=True, - ) - - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision - # as these weights are only used for inference, keeping weights in full precision is not required. 
- if self.accelerator.mixed_precision == "bf16": - inference_dtype = torch.bfloat16 - else: - inference_dtype = torch.float32 - - self.sd_pipeline.vae.to(self.accelerator.device, dtype=inference_dtype) - self.sd_pipeline.text_encoder.to(self.accelerator.device, dtype=inference_dtype) - self.sd_pipeline.unet.to(self.accelerator.device, dtype=inference_dtype) - - trainable_layers = self.sd_pipeline.get_trainable_layers() - - self.accelerator.register_save_state_pre_hook(self._save_model_hook) - self.accelerator.register_load_state_pre_hook(self._load_model_hook) - - self.optimizer = self._setup_optimizer( - trainable_layers.parameters() if not isinstance(trainable_layers, list) else trainable_layers - ) - - self.neg_prompt_embed = self.sd_pipeline.text_encoder( - self.sd_pipeline.tokenizer( - [""] if self.config.negative_prompts is None else self.config.negative_prompts, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=self.sd_pipeline.tokenizer.model_max_length, - ).input_ids.to(self.accelerator.device) - )[0] - - if config.per_prompt_stat_tracking: - self.stat_tracker = PerPromptStatTracker( - config.per_prompt_stat_tracking_buffer_size, - config.per_prompt_stat_tracking_min_count, - ) - - if hasattr(self.sd_pipeline, "use_lora") and self.sd_pipeline.use_lora: - unet, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) - self.trainable_layers = list(filter(lambda p: p.requires_grad, unet.parameters())) - else: - self.trainable_layers, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) - - if self.gaudi_config.use_fused_clip_norm: - try: - from habana_frameworks.torch.hpex.normalization import FusedClipNorm - except ImportError as error: - error.msg = ( - f"Could not import 'FusedClipNorm' from 'habana_frameworks.torch.hpex.normalization'. {error.msg}." - ) - raise error - self.FusedNorm = FusedClipNorm( - self.trainable_layers.parameters() - if not isinstance(self.trainable_layers, list) - else self.trainable_layers, - self.config.train_max_grad_norm, - ) - - if use_hpu_graphs: - import habana_frameworks.torch as ht - - ht.hpu.ModuleCacher()(model=self.sd_pipeline.unet, inplace=True) - - if self.config.async_reward_computation: - self.executor = futures.ThreadPoolExecutor(max_workers=config.max_workers) - - if config.resume_from: - logger.info(f"Resuming from {config.resume_from}") - self.accelerator.load_state(config.resume_from) - self.first_epoch = int(config.resume_from.split("_")[-1]) + 1 - else: - self.first_epoch = 0 - - def step(self, epoch: int, global_step: int): - """ - Adapted from https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L234 - - Add progress bar to track training epochs - - Convert bfloat to float when creating to numpy arrays - """ - samples, prompt_image_data = self._generate_samples( - iterations=self.config.sample_num_batches_per_epoch, - batch_size=self.config.sample_batch_size, - ) - - # collate samples into dict where each entry has shape (num_batches_per_epoch * sample.batch_size, ...) 
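# [Editorial sketch, not part of the deleted file recorded in this diff] A minimal, hedged
# example of how the GaudiDDPOTrainer defined above could be constructed. It assumes TRL
# v0.7.x exports DDPOConfig and DefaultDDPOStableDiffusionPipeline and that GaudiConfig
# accepts the `use_fused_adam` / `use_fused_clip_norm` flags read by this trainer; the model
# name, prompts and constant reward below are placeholders, and a Gaudi-enabled pipeline
# subclass may be preferable in practice.
import torch
from trl import DDPOConfig, DefaultDDPOStableDiffusionPipeline
from optimum.habana import GaudiConfig

def prompt_fn():
    # returns (prompt, prompt_metadata), as expected by _generate_samples
    return "a photo of a cat", {}

def reward_fn(images, prompts, metadata):
    # returns (rewards, reward_metadata); a constant reward keeps the sketch trivial
    return torch.ones(images.shape[0], device=images.device), {}

gaudi_config = GaudiConfig()             # assumption: default config, fused ops enabled below
gaudi_config.use_fused_adam = True       # -> _setup_optimizer selects FusedAdamW
gaudi_config.use_fused_clip_norm = True  # -> __init__ builds FusedClipNorm for grad clipping

trainer = GaudiDDPOTrainer(
    config=DDPOConfig(mixed_precision="bf16"),
    reward_function=reward_fn,
    prompt_function=prompt_fn,
    sd_pipeline=DefaultDDPOStableDiffusionPipeline("runwayml/stable-diffusion-v1-5"),
    gaudi_config=gaudi_config,
    use_habana=True,
    use_hpu_graphs=True,                 # caches the UNet with ht.hpu.ModuleCacher
)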
- samples = {k: torch.cat([s[k] for s in samples]) for k in samples[0].keys()} - - rewards, rewards_metadata = self.compute_rewards( - prompt_image_data, is_async=self.config.async_reward_computation - ) - - for i, image_data in enumerate(prompt_image_data): - image_data.extend([rewards[i], rewards_metadata[i]]) - - if self.image_samples_callback is not None: - self.image_samples_callback(prompt_image_data, global_step, self.accelerator.trackers[0]) - - rewards = torch.cat(rewards) - - if rewards.dtype == torch.bfloat16: - rewards = rewards.float() # bf16 not supported by numpy - - rewards = self.accelerator.gather(rewards).cpu().numpy() - - self.accelerator.log( - { - "reward": rewards, - "epoch": epoch, - "reward_mean": rewards.mean(), - "reward_std": rewards.std(), - }, - step=global_step, - ) - - if self.config.per_prompt_stat_tracking: - # gather the prompts across processes - prompt_ids = self.accelerator.gather(samples["prompt_ids"]).cpu().numpy() - prompts = self.sd_pipeline.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True) - advantages = self.stat_tracker.update(prompts, rewards) - else: - advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8) - - # ungather advantages; keep the entries corresponding to the samples on this process - samples["advantages"] = ( - torch.as_tensor(advantages) - .reshape(self.accelerator.num_processes, -1)[self.accelerator.process_index] - .to(self.accelerator.device) - ) - - del samples["prompt_ids"] - - total_batch_size, num_timesteps = samples["timesteps"].shape - - pbar = tqdm( - range(self.config.train_num_inner_epochs), - desc=f"Epoch {epoch}", - disable=not self.accelerator.is_main_process, - ) - for inner_epoch in pbar: - # shuffle samples along batch dimension - perm = torch.randperm(total_batch_size, device=self.accelerator.device) - samples = {k: v[perm] for k, v in samples.items()} - - # shuffle along time dimension independently for each sample - # still trying to understand the code below - perms = torch.stack( - [torch.randperm(num_timesteps, device=self.accelerator.device) for _ in range(total_batch_size)] - ) - - for key in ["timesteps", "latents", "next_latents", "log_probs"]: - samples[key] = samples[key][ - torch.arange(total_batch_size, device=self.accelerator.device)[:, None], - perms, - ] - - original_keys = samples.keys() - original_values = samples.values() - # rebatch them as user defined train_batch_size is different from sample_batch_size - reshaped_values = [v.reshape(-1, self.config.train_batch_size, *v.shape[1:]) for v in original_values] - - # Transpose the list of original values - transposed_values = zip(*reshaped_values) - # Create new dictionaries for each row of transposed values - samples_batched = [dict(zip(original_keys, row_values)) for row_values in transposed_values] - - self.sd_pipeline.unet.train() - global_step = self._train_batched_samples(inner_epoch, epoch, global_step, samples_batched) - # ensure optimization step at the end of the inner epoch - if not self.accelerator.sync_gradients: - raise ValueError( - "Optimization step should have been performed by this point. Please check calculated gradient accumulation settings." 
- ) - - if epoch != 0 and epoch % self.config.save_freq == 0 and self.accelerator.is_main_process: - self.accelerator.save_state() - - return global_step - - def calculate_loss(self, latents, timesteps, next_latents, log_probs, advantages, embeds): - """ - Adapted from https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L340 - - Use accelerator autocast (original TRL implemenation uses nullcontext for LoRA training) - - Convert logprob to float for loss calculation - """ - with self.accelerator.autocast(): - if self.config.train_cfg: - noise_pred = self.sd_pipeline.unet( - torch.cat([latents] * 2), - torch.cat([timesteps] * 2), - embeds, - ).sample - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.config.sample_guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - else: - noise_pred = self.sd_pipeline.unet( - latents, - timesteps, - embeds, - ).sample - # compute the log prob of next_latents given latents under the current model - - scheduler_step_output = self.sd_pipeline.scheduler_step( - noise_pred, - timesteps, - latents, - eta=self.config.sample_eta, - prev_sample=next_latents, - ) - - log_prob = scheduler_step_output.log_probs.float() - - log_probs = log_probs.float() - advantages = torch.clamp( - advantages.float(), - -self.config.train_adv_clip_max, - self.config.train_adv_clip_max, - ) - - ratio = torch.exp(log_prob - log_probs) - - loss = self.loss(advantages, self.config.train_clip_range, ratio) - - approx_kl = 0.5 * torch.mean((log_prob - log_probs) ** 2) - - clipfrac = torch.mean((torch.abs(ratio - 1.0) > self.config.train_clip_range).float()) - - return loss, approx_kl, clipfrac - - def _setup_optimizer(self, trainable_layers_parameters): - # Adapted from https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L422 - # Adds support for FusedAdamW - if self.use_habana and self.gaudi_config.use_fused_adam: - from habana_frameworks.torch.hpex.optimizers import FusedAdamW - - optimizer_cls = FusedAdamW - else: - optimizer_cls = torch.optim.AdamW - - return optimizer_cls( - trainable_layers_parameters, - lr=self.config.train_learning_rate, - betas=(self.config.train_adam_beta1, self.config.train_adam_beta2), - weight_decay=self.config.train_adam_weight_decay, - eps=self.config.train_adam_epsilon, - ) - - def _generate_samples(self, iterations, batch_size): - """ - Adapted from https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L446 - - Load timesteps to HPU - """ - samples = [] - prompt_image_pairs = [] - self.sd_pipeline.unet.eval() - - sample_neg_prompt_embeds = self.neg_prompt_embed.repeat(batch_size, 1, 1) - - for i in range(iterations): - prompts, prompt_metadata = zip(*[self.prompt_fn() for _ in range(batch_size)]) - - prompt_ids = self.sd_pipeline.tokenizer( - prompts, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=self.sd_pipeline.tokenizer.model_max_length, - ).input_ids.to(self.accelerator.device) - prompt_embeds = self.sd_pipeline.text_encoder(prompt_ids)[0] - - with self.accelerator.autocast(): - sd_output = self.sd_pipeline( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=sample_neg_prompt_embeds, - num_inference_steps=self.config.sample_num_steps, - guidance_scale=self.config.sample_guidance_scale, - eta=self.config.sample_eta, - output_type="pt", - ) - - images = sd_output.images - latents = sd_output.latents - log_probs = sd_output.log_probs - - latents = torch.stack(latents, dim=1) # (batch_size, 
num_steps + 1, ...) - log_probs = torch.stack(log_probs, dim=1) # (batch_size, num_steps, 1) - timesteps = self.sd_pipeline.scheduler.timesteps.repeat(batch_size, 1) - timesteps = timesteps.to(latents.device) - - samples.append( - { - "prompt_ids": prompt_ids, - "prompt_embeds": prompt_embeds, - "timesteps": timesteps, - "latents": latents[:, :-1], # each entry is the latent before timestep t - "next_latents": latents[:, 1:], # each entry is the latent after timestep t - "log_probs": log_probs, - "negative_prompt_embeds": sample_neg_prompt_embeds, - } - ) - prompt_image_pairs.append([images, prompts, prompt_metadata]) - - return samples, prompt_image_pairs - - def _train_batched_samples(self, inner_epoch, epoch, global_step, batched_samples): - """ - Adapted from https://github.com/huggingface/trl/blob/v0.7.8/trl/trainer/ddpo_trainer.py#L508 - - Reduce recompilations by avoiding constant variables in loops - - Add `mark_step()` to support lazy mode - """ - info = defaultdict(list) - - for _i, sample in enumerate(batched_samples): - if self.config.train_cfg: - # concat negative prompts to sample prompts to avoid two forward passes - embeds = torch.cat([sample["negative_prompt_embeds"], sample["prompt_embeds"]]) - else: - embeds = sample["prompt_embeds"] - - latents = sample["latents"] - timesteps = sample["timesteps"] - next_latents = sample["next_latents"] - log_probs = sample["log_probs"] - - for j in range(self.num_train_timesteps): # , desc=f"Epoch{i}"): - with self.accelerator.accumulate(self.sd_pipeline.unet): - # Reduce recompilations by avoiding constant variables in loops - latent = latents[:, 0] - timestep = timesteps[:, 0] - next_latent = next_latents[:, 0] - log_prob = log_probs[:, 0] - latents = torch.roll(latents, shifts=-1, dims=1) - timesteps = torch.roll(timesteps, shifts=-1, dims=1) - next_latents = torch.roll(next_latents, shifts=-1, dims=1) - log_probs = torch.roll(log_probs, shifts=-1, dims=1) - - loss, approx_kl, clipfrac = self.calculate_loss( - latent, - timestep, - next_latent, - log_prob, - sample["advantages"], - embeds, - ) - - info["approx_kl"].append(approx_kl) - info["clipfrac"].append(clipfrac) - info["loss"].append(loss) - self.accelerator.backward(loss) - if self.use_habana: - self.htcore.mark_step() - - if self.accelerator.sync_gradients: - trainable_layers = ( - self.trainable_layers.parameters() - if not isinstance(self.trainable_layers, list) - else self.trainable_layers - ) - if self.gaudi_config.use_fused_clip_norm: - self.FusedNorm.clip_norm(trainable_layers) - else: - self.self.accelerator.clip_grad_norm_( - trainable_layers, - self.config.train_max_grad_norm, - ) - self.optimizer.step() - self.optimizer.zero_grad() - if self.use_habana: - self.htcore.mark_step() - - # Checks if the accelerator has performed an optimization step behind the scenes - if self.accelerator.sync_gradients: - # log training-related stuff - info = {k: torch.mean(torch.stack(v)) for k, v in info.items()} - info = self.accelerator.reduce(info, reduction="mean") - info.update({"epoch": epoch, "inner_epoch": inner_epoch}) - self.accelerator.log(info, step=global_step) - global_step += 1 - info = defaultdict(list) - - return global_step diff --git a/optimum/habana/trl/trainer/dpo_trainer.py b/optimum/habana/trl/trainer/dpo_trainer.py deleted file mode 100644 index 1d74d7e331..0000000000 --- a/optimum/habana/trl/trainer/dpo_trainer.py +++ /dev/null @@ -1,436 +0,0 @@ -# DPO Authors: Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. 
Manning, and Chelsea Finn 2023 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import inspect -import warnings -from collections import defaultdict -from typing import Callable, Dict, List, Literal, Optional, Tuple, Union - -import torch -import torch.nn as nn -from accelerate.utils import is_deepspeed_available -from datasets import Dataset -from transformers import ( - AutoModelForCausalLM, - DataCollator, - PreTrainedModel, - PreTrainedTokenizerBase, -) -from transformers.trainer_callback import TrainerCallback -from transformers.trainer_utils import EvalLoopOutput -from trl import DPOTrainer, create_reference_model -from trl.import_utils import is_peft_available, is_wandb_available -from trl.trainer.utils import ( - DPODataCollatorWithPadding, - disable_dropout_in_model, - pad_to_length, -) - -from ... import GaudiConfig, GaudiTrainer, GaudiTrainingArguments - - -if is_peft_available(): - from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training - - -if is_wandb_available(): - pass - -if is_deepspeed_available(): - pass - - -class GaudiDPOTrainer(DPOTrainer, GaudiTrainer): - def __init__( - self, - model: Union[PreTrainedModel, nn.Module, str] = None, - ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, - beta: float = 0.1, - label_smoothing: float = 0, - loss_type: Literal["sigmoid", "hinge", "ipo", "kto"] = "sigmoid", - args: GaudiTrainingArguments = None, - gaudi_config: GaudiConfig = None, - data_collator: Optional[DataCollator] = None, - label_pad_token_id: int = -100, - padding_value: int = None, - truncation_mode: str = "keep_end", - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Optional[Callable[[], PreTrainedModel]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - max_length: Optional[int] = None, - max_prompt_length: Optional[int] = None, - max_target_length: Optional[int] = None, - peft_config: Optional[Dict] = None, - is_encoder_decoder: Optional[bool] = None, - disable_dropout: bool = True, - generate_during_eval: bool = False, - compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None, - precompute_ref_log_probs: bool = False, - model_init_kwargs: Optional[Dict] = None, - ref_model_init_kwargs: Optional[Dict] = None, - model_adapter_name: Optional[str] = None, - ref_adapter_name: Optional[str] = None, - reference_free: bool = False, - force_use_ref_model: bool = False, - ): - """ - Copied from DPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/dpo_trainer.py#L127 - The only differences are: - - add new args gaudi_config - - use graph for ref_model - - use GaudiTrainer instead of Trainer - - cast peft 
model to bf16. - """ - if model_init_kwargs is None: - model_init_kwargs = {} - elif not isinstance(model, str): - raise ValueError("You passed model_kwargs to the DPOTrainer. But your model is already instantiated.") - - if ref_model_init_kwargs is None: - ref_model_init_kwargs = {} - elif not isinstance(ref_model, str): - raise ValueError( - "You passed ref_model_kwargs to the DPOTrainer. But your ref_model is already instantiated." - ) - - if isinstance(model, str): - warnings.warn( - "You passed a model_id to the DPOTrainer. This will automatically create an " - "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." - ) - model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) - - if isinstance(ref_model, str): - warnings.warn( - "You passed a ref model_id to the DPOTrainer. This will automatically create an " - "`AutoModelForCausalLM`" - ) - ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) - - # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` - # has been called in order to properly call autocast if needed. - self._peft_has_been_casted_to_bf16 = False - - if not is_peft_available() and peft_config is not None: - raise ValueError( - "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" - ) - elif is_peft_available() and peft_config is not None: - # if model is a peft model and we have a peft_config, we merge and unload it first - if isinstance(model, PeftModel): - model = model.merge_and_unload() - - if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): - _support_gc_kwargs = hasattr( - args, "gradient_checkpointing_kwargs" - ) and "gradient_checkpointing_kwargs" in list( - inspect.signature(prepare_model_for_kbit_training).parameters - ) - - preprare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} - - if _support_gc_kwargs: - preprare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs - - model = prepare_model_for_kbit_training(model, **preprare_model_kwargs) - elif getattr(args, "gradient_checkpointing", False): - # For backward compatibility with older versions of transformers - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - # get peft model with the given config - model = get_peft_model(model, peft_config) - if args.bf16: - model = model.to(torch.bfloat16) - - # For models that use gradient_checkpoiting, we need to attach a hook that enables input - # to explicitly have `requires_grad=True`, otherwise training will either silently - # fail or completely fail. - elif getattr(args, "gradient_checkpointing", False): - # For backward compatibility with older versions of transformers - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - if generate_during_eval and not is_wandb_available(): - raise ValueError( - "`generate_during_eval=True` requires Weights and Biases to be installed." - " Please install `wandb` to resolve." 
- ) - - if model is not None: - self.is_encoder_decoder = model.config.is_encoder_decoder - elif is_encoder_decoder is None: - raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") - else: - self.is_encoder_decoder = is_encoder_decoder - - self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) - self.model_adapter_name = model_adapter_name - self.ref_adapter_name = ref_adapter_name - self.reference_free = reference_free - - if ref_model: - self.ref_model = ref_model - elif self.is_peft_model or precompute_ref_log_probs: - # The `model` with adapters turned off will be used as the reference model - self.ref_model = None - else: - self.ref_model = create_reference_model(model) - - if data_collator is None: - if tokenizer is None: - raise ValueError( - "max_length or a tokenizer must be specified when using the default DPODataCollatorWithPadding" - ) - if max_length is None: - warnings.warn( - "When using DPODataCollatorWithPadding, you should set `max_length` in the DPOTrainer's init" - " it will be set to `512` by default, but you should do it yourself in the future.", - UserWarning, - ) - max_length = 512 - if max_prompt_length is None: - warnings.warn( - "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the DPOTrainer's init" - " it will be set to `128` by default, but you should do it yourself in the future.", - UserWarning, - ) - max_prompt_length = 128 - - if max_target_length is None and self.is_encoder_decoder: - warnings.warn( - "When using DPODataCollatorWithPadding with an encoder decoder architecture, you should set `max_target_length` in the DPOTrainer's init" - " it will be set to `128` by default, but you should do it yourself in the future.", - UserWarning, - ) - max_target_length = 128 - - data_collator = DPODataCollatorWithPadding( - pad_token_id=tokenizer.pad_token_id, - label_pad_token_id=label_pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - if args.remove_unused_columns: - args.remove_unused_columns = False - # warn users - warnings.warn( - "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments" - " we have set it for you, but you should do it yourself in the future.", - UserWarning, - ) - - self.use_dpo_data_collator = True - else: - self.use_dpo_data_collator = False - - if disable_dropout: - disable_dropout_in_model(model) - if self.ref_model is not None: - disable_dropout_in_model(self.ref_model) - - self.max_length = max_length - self.generate_during_eval = generate_during_eval - self.label_pad_token_id = label_pad_token_id - self.padding_value = padding_value if padding_value is not None else tokenizer.pad_token_id - self.max_prompt_length = max_prompt_length - self.truncation_mode = truncation_mode - self.max_target_length = max_target_length - self.tokenizer = tokenizer - self.precompute_ref_log_probs = precompute_ref_log_probs - - # Since ref_logs are precomputed on the first call to get_train/eval_dataloader - # keep track of first called to avoid computation of future calls - self._precomputed_train_ref_log_probs = False - self._precomputed_eval_ref_log_probs = False - - if loss_type in ["hinge", "ipo", "kto_pair"] and label_smoothing > 0: - warnings.warn( - "You are using a loss type that does not support label smoothing. Ignoring label_smoothing parameter." 
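# [Editorial note, not part of the deleted file] Why this trainer pads to a fixed
# `max_length` on Gaudi: `concatenated_inputs` below is called with
# `padded_max_length=self.max_length`, so every chosen/rejected batch ends up with the
# same static shape and the HPU graph is compiled once rather than once per sequence
# length. A toy illustration with TRL's `pad_to_length` (shapes and values are made up):
#
#   import torch
#   from trl.trainer.utils import pad_to_length
#   chosen   = torch.randint(0, 100, (4, 37))             # (batch, seq_len) varies per batch
#   rejected = torch.randint(0, 100, (4, 81))
#   chosen   = pad_to_length(chosen,   512, pad_value=0)   # -> (4, 512)
#   rejected = pad_to_length(rejected, 512, pad_value=0)   # -> (4, 512)
#   concatenated = torch.cat((chosen, rejected), dim=0)    # -> (8, 512), fixed across steps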
- ) - - self.beta = beta - self.label_smoothing = label_smoothing - self.loss_type = loss_type - - self._stored_metrics = defaultdict(lambda: defaultdict(list)) - - # tokenize the dataset - train_dataset = train_dataset.map(self.tokenize_row) - if eval_dataset is not None: - eval_dataset = eval_dataset.map(self.tokenize_row) - - GaudiTrainer.__init__( - self, - model=model, - args=args, - gaudi_config=gaudi_config, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - if not hasattr(self, "accelerator"): - raise AttributeError( - "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." - ) - - # Deepspeed Zero-3 does not support precompute_ref_log_probs - if self.is_deepspeed_enabled: - if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: - raise ValueError( - "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." - ) - - if self.ref_model is None: - if not (self.is_peft_model or self.precompute_ref_log_probs): - raise ValueError( - "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" - ) - else: - if self.is_deepspeed_enabled: - self.ref_model = self._prepare_deepspeed(self.ref_model) - else: - self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) - - from habana_frameworks.torch.hpu import wrap_in_hpu_graph # use graph for ref_model - - ref_model = self.accelerator.unwrap_model(self.ref_model) - ref_model = wrap_in_hpu_graph(ref_model) - - @staticmethod - def concatenated_inputs( - batch: Dict[str, Union[List, torch.LongTensor]], - is_encoder_decoder: bool = False, - label_pad_token_id: int = -100, - padding_value: int = 0, - device: Optional[torch.device] = None, - padded_max_length: int = 0, - ) -> Dict[str, torch.LongTensor]: - """ - Copied from DPOTrainer.concatenated_inputs: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/dpo_trainer.py#L701 - - pad to self.max_length in Gaudi2 - """ - concatenated_batch = {} - - if is_encoder_decoder: - max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]) - else: - max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) - - if padded_max_length != 0: # pad to max_length in Gaudi - max_length = padded_max_length - for k in batch: - if k.startswith("chosen") and isinstance(batch[k], torch.Tensor): - if "labels" in k or is_encoder_decoder: - pad_value = label_pad_token_id - elif k.endswith("_input_ids"): - pad_value = padding_value - elif k.endswith("_attention_mask"): - pad_value = 0 - concatenated_key = k.replace("chosen", "concatenated") - concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value) - for k in batch: - if k.startswith("rejected") and isinstance(batch[k], torch.Tensor): - if "labels" in k or is_encoder_decoder: - pad_value = label_pad_token_id - elif k.endswith("_input_ids"): - pad_value = padding_value - elif k.endswith("_attention_mask"): - pad_value = 0 - concatenated_key = k.replace("rejected", "concatenated") - concatenated_batch[concatenated_key] = torch.cat( - ( - concatenated_batch[concatenated_key], - pad_to_length(batch[k], max_length, pad_value=pad_value), - ), - 
dim=0, - ).to(device=device) - - if is_encoder_decoder: - concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device) - concatenated_batch["concatenated_attention_mask"] = ( - batch["prompt_attention_mask"].repeat(2, 1).to(device=device) - ) - - return concatenated_batch - - def concatenated_forward( - self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]] - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: - """ - Copied from DPOTrainer.concatenated_forward: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/dpo_trainer.py#L866 - - pad to self.max_length in Gaudi2 - """ - concatenated_batch = self.concatenated_inputs( - batch, - is_encoder_decoder=self.is_encoder_decoder, - label_pad_token_id=self.label_pad_token_id, - padding_value=self.padding_value, - device=self.accelerator.device, - padded_max_length=self.max_length, - ) - len_chosen = batch["chosen_labels"].shape[0] - - model_kwargs = ( - { - "labels": concatenated_batch["concatenated_labels"], - "decoder_input_ids": concatenated_batch.pop("concatenated_decoder_input_ids", None), - } - if self.is_encoder_decoder - else {} - ) - all_logits = model( - concatenated_batch["concatenated_input_ids"], - attention_mask=concatenated_batch["concatenated_attention_mask"], - **model_kwargs, - ).logits - - all_logps = self.get_batch_logps( - all_logits, - concatenated_batch["concatenated_labels"], - average_log_prob=False, - is_encoder_decoder=self.is_encoder_decoder, - label_pad_token_id=self.label_pad_token_id, - ) - - chosen_logps = all_logps[:len_chosen] - rejected_logps = all_logps[len_chosen:] - - chosen_logits = all_logits[:len_chosen] - rejected_logits = all_logits[len_chosen:] - - return (chosen_logps, rejected_logps, chosen_logits, rejected_logits) diff --git a/optimum/habana/trl/trainer/ppo_config.py b/optimum/habana/trl/trainer/ppo_config.py deleted file mode 100644 index 49e798be6b..0000000000 --- a/optimum/habana/trl/trainer/ppo_config.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass - -import numpy as np -from trl import PPOConfig, is_wandb_available -from trl.trainer.utils import exact_div - - -@dataclass -class GaudiPPOConfig(PPOConfig): - """ - Configuration class for GaudiPPOTrainer - """ - - use_habana: bool = False - """Indicate if habana is used""" - pad_for_acceleration: bool = False - """Indicate if padding is used for acceleration. """ - pad_max_len: int = 0 - """max total length including padding. Only applicable if pad_for_acceleration is True""" - pad_max_input_len: int = 0 - """max input length including padding. 
Only applicable if pad_for_acceleration is True""" - - def __post_init__(self): - self.backward_batch_size = self.mini_batch_size * self.gradient_accumulation_steps - exact_div( - self.batch_size, - self.backward_batch_size, - "`batch_size`", - "`mini_batch_size * gradient_accumulation_steps`", - "`batch_size` must be a multiple of `mini_batch_size * gradient_accumulation_steps`", - ) - self.total_ppo_epochs = int(np.ceil(self.steps / self.batch_size)) - - # check if wandb is installed - if self.log_with == "wandb": - # raise error if wandb is not installed - if not is_wandb_available(): - raise ImportError( - "Please install wandb to use wandb logging. You can do this by running `pip install wandb`." - ) - self.pad_for_acceleration = (self.pad_max_len > 0) and (self.pad_max_input_len > 0) - - if self.pad_for_acceleration: - if self.pad_max_input_len >= self.pad_max_len: - raise AssertionError( - "pad_max_input_len ({self.pad_max_input_len}) must be smaller " - " then pad_max_len ({self.pad_max_len})" - ) - - if self.use_habana: - from optimum.habana.transformers.modeling_utils import ( - adapt_transformers_to_gaudi, - ) - - adapt_transformers_to_gaudi() - - assert self.kl_penalty in ["kl", "abs", "mse", "full"] diff --git a/optimum/habana/trl/trainer/ppo_trainer.py b/optimum/habana/trl/trainer/ppo_trainer.py deleted file mode 100644 index 9f72b02e44..0000000000 --- a/optimum/habana/trl/trainer/ppo_trainer.py +++ /dev/null @@ -1,902 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import time -import typing -import warnings -from contextlib import nullcontext -from typing import Callable, List, Optional, Union - -import habana_frameworks.torch as ht -import numpy as np -import torch -from accelerate.utils import ProjectConfiguration -from datasets import Dataset -from torch.optim import Adam -from transformers import ( - DataCollatorForLanguageModeling, - PreTrainedTokenizer, - PreTrainedTokenizerBase, - PreTrainedTokenizerFast, -) -from trl import PPOTrainer -from trl.core import ( - WANDB_PADDING, - PPODecorators, - convert_to_scalar, - logprobs_from_logits, - stack_dicts, - stats_to_np, -) -from trl.import_utils import is_torch_greater_2_0 -from trl.models import ( - SUPPORTED_ARCHITECTURES, - PreTrainedModelWrapper, - create_reference_model, -) -from trl.trainer import ( - AdaptiveKLController, - BaseTrainer, - FixedKLController, - RunningMoments, -) - -from optimum.habana.utils import set_seed - -from . 
import GaudiPPOConfig - - -_recorded_graph = None - - -class GaudiPPOTrainer(PPOTrainer): - def __init__( - self, - config: GaudiPPOConfig = None, - model: PreTrainedModelWrapper = None, - ref_model: Optional[PreTrainedModelWrapper] = None, - tokenizer: PreTrainedTokenizerBase = None, - dataset: Optional[Union[torch.utils.data.Dataset, Dataset]] = None, - optimizer: Optional[torch.optim.Optimizer] = None, - data_collator: Optional[typing.Callable] = None, - num_shared_layers: Optional[int] = None, - lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, - ): - """ - Copied from PPOTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L145 - The only differences are: - - add new args for Gaudi in config - - use GaudiAccelerator instead of Accelerator - """ - BaseTrainer.__init__(self, config) - - # initial seed for reproducible experiments - set_seed(config.seed) - - # Step 0: check positional arguments validity - if not isinstance(config, GaudiPPOConfig): - raise ValueError(f"config must be a PPOConfig, got {type(config)}") - if not isinstance(tokenizer, (PreTrainedTokenizerBase)): - raise ValueError( - f"tokenizer must be a PreTrainedTokenizerBase like a PreTrainedTokenizer or a PreTrainedTokenizerFast, got {type(tokenizer)}" - ) - if not isinstance(model, (SUPPORTED_ARCHITECTURES)): - raise ValueError( - f"model must be a PreTrainedModelWrapper, got {type(model)} - supported architectures are: {SUPPORTED_ARCHITECTURES}" - ) - # Step 1: Initialize Accelerator - if config.use_habana: - from optimum.habana.accelerate import GaudiAccelerator as Accelerator - else: - from accelerate import Accelerator - self.accelerator = Accelerator( - log_with=config.log_with, - gradient_accumulation_steps=config.gradient_accumulation_steps, - project_config=ProjectConfiguration(**config.project_kwargs), - **config.accelerator_kwargs, - ) - - # Step 1.1 Runtime variables filled by the accelerator - config.world_size = self.accelerator.num_processes - config.global_backward_batch_size = config.backward_batch_size * config.world_size - config.global_batch_size = config.batch_size * config.world_size - - self.model = model.to(self.accelerator.device.type) - self.model_params = filter(lambda p: p.requires_grad, self.model.parameters()) - self.is_encoder_decoder = hasattr(self.model, "is_encoder_decoder") - self.is_peft_model = getattr(self.model, "is_peft_model", False) - config.is_encoder_decoder = self.is_encoder_decoder - config.is_peft_model = self.is_peft_model - - is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" - self.accelerator.init_trackers( - config.tracker_project_name, - config=({"trl_ppo_trainer_config": config.to_dict()} if not is_using_tensorboard else config.to_dict()), - init_kwargs=config.tracker_kwargs, - ) - self.is_using_text_environment = getattr(config, "use_text_environment", False) - - if isinstance(ref_model, SUPPORTED_ARCHITECTURES): - self.ref_model = ref_model.to(self.accelerator.device.type) - if num_shared_layers is not None: - warnings.warn( - "num_shared_layers is ignored when ref_model is provided. 
Two different models are used for the " - "model and the reference model and no layers are shared.", - UserWarning, - ) - elif ref_model is None and not self.is_peft_model: - self.ref_model = create_reference_model(self.model, num_shared_layers=num_shared_layers) - elif self.is_peft_model: - self.ref_model = None - else: - raise ValueError( - f"ref_model must be a PreTrainedModelWrapper or `None`, got {type(ref_model)} - supported " - f"architectures are: {SUPPORTED_ARCHITECTURES} " - ) - self.optional_peft_ctx = ( - self.accelerator.unwrap_model(self.model).pretrained_model.disable_adapter - if self.is_peft_model - else nullcontext - ) - - if not (isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast)): - raise ValueError( - "tokenizer must be a transformers.PreTrainedTokenizer or transformers.PreTrainedTokenizerFast" - ) - self.tokenizer = tokenizer - - if dataset is not None and not (isinstance(dataset, torch.utils.data.Dataset) or isinstance(dataset, Dataset)): - raise ValueError("dataset must be a torch.utils.data.Dataset or datasets.Dataset") - elif dataset is None: - warnings.warn( - "No dataset is provided. Make sure to set config.batch_size to the correct value before training.", - UserWarning, - ) - self.dataset = dataset - self._signature_columns = None - if self.dataset is not None: - self.dataloader = self.prepare_dataloader(self.dataset, data_collator) - elif self.dataset is None and self.accelerator.num_processes > 1: - warnings.warn( - "No dataset is provided. In a multi-GPU setting, this will lead to an error. You should" - " prepare your dataloader yourself with `dataloader = ppo_trainer.accelerator.prepare(dataloader)`" - " and using `torch.utils.data.DataLoader`, or pass a dataset to the `PPOTrainer`. 
Please " - " refer to the documentation for more details.", - UserWarning, - ) - self.dataloader = None - else: - self.dataloader = None - - # Step 3: Initialize optimizer and data collator - self.data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False) - if optimizer is None: - self.optimizer = Adam( - filter(lambda p: p.requires_grad, self.model.parameters()), - lr=self.config.learning_rate, - ) - else: - self.optimizer = optimizer - - self.lr_scheduler = lr_scheduler - if self.lr_scheduler is not None: - lr_scheduler_class = ( - torch.optim.lr_scheduler._LRScheduler - if not is_torch_greater_2_0() - else torch.optim.lr_scheduler.LRScheduler - ) - - if not isinstance(self.lr_scheduler, lr_scheduler_class): - raise ValueError( - "lr_scheduler must be a torch.optim.lr_scheduler._LRScheduler or torch.optim.lr_scheduler.LRScheduler (for torch >= 2.0)" - ) - - if self.config.adap_kl_ctrl: - self.kl_ctl = AdaptiveKLController(self.config.init_kl_coef, self.config.target, self.config.horizon) - else: - self.kl_ctl = FixedKLController(self.config.init_kl_coef) - - if self.accelerator.distributed_type == "MULTI_HPU": - from accelerate.utils import DistributedDataParallelKwargs - - kwargs = {} - kwargs["find_unused_parameters"] = True - kwargs["gradient_as_bucket_view"] = True - self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) - - # Safety checkers for DS integration - is_deepspeed_used = self.accelerator.distributed_type == "DEEPSPEED" and hasattr( - self.accelerator.state, "deepspeed_plugin" - ) - - ( - self.model, - self.optimizer, - self.data_collator, - self.dataloader, - self.lr_scheduler, - ) = self.accelerator.prepare( - self.model, - self.optimizer, - self.data_collator, - self.dataloader, - self.lr_scheduler, - ) - if is_deepspeed_used: - # Quantized models are already set on the correct device - if not self.is_peft_model and not ( - getattr(self.ref_model.pretrained_model, "is_loaded_in_8bit", False) - or getattr(self.ref_model.pretrained_model, "is_loaded_in_4bit", False) - ): - self.ref_model = self._prepare_deepspeed(self.ref_model) - else: - self.ref_model = self.accelerator.prepare(self.ref_model) - - # In a distributed setup, only logging needs to be performed on the main process - # check: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html - # or: https://discuss.pytorch.org/t/use-distributed-data-parallel-correctly/82500/11 - self.is_distributed = self.accelerator.num_processes > 1 - - # init the current step - self.current_step = 0 - - # init variables for pushing model to hub - if config.push_to_hub_if_best_kwargs: - if "repo_id" not in config.push_to_hub_if_best_kwargs: - raise ValueError("You have to specify repo_id in order to push the model to the hub!") - self.push_to_hub_kwargs = config.push_to_hub_if_best_kwargs - self.compare_step = 0 - self.highest_reward = torch.tensor(-float("inf")) - - # post process for PP - if not getattr(self.model, "is_sequential_parallel", False): - self.current_device = self.accelerator.device - else: - if self.accelerator.device.type == "hpu": - self.current_device = torch.device("hpu") - else: - self.current_device = torch.device("cpu") - - PPODecorators.optimize_device_cache = self.config.optimize_device_cache - - self.running = RunningMoments(self.accelerator) - if config.use_habana: - import habana_frameworks.torch.core as htcore - - self.htcore = htcore - - def generate( - self, - query_tensor: Union[torch.Tensor, List[torch.Tensor]], - length_sampler: 
Callable = None, - batch_size: int = 4, - return_prompt: bool = True, - generate_ref_response: bool = False, - **generation_kwargs, - ): - """ - Copied from PPOTrainer.generate: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L433 - The only differences are: - - add hpu graph for acceleration - """ - if generate_ref_response: - ref_model = self.model if self.is_peft_model else self.ref_model - if isinstance(query_tensor, List): - if self.config.use_habana: - self.wrap_generation_for_hpu_graph_mode(self.model) - response = self._generate_batched( - self.model, - query_tensor, - length_sampler=length_sampler, - batch_size=batch_size, - return_prompt=return_prompt, - **generation_kwargs, - ) - if generate_ref_response: - with self.optional_peft_ctx(): - if self.config.use_habana: - self.wrap_generation_for_hpu_graph_mode(ref_model) - ref_response = self._generate_batched( - ref_model, - query_tensor, - length_sampler=length_sampler, - batch_size=batch_size, - return_prompt=return_prompt, - **generation_kwargs, - ) - - else: - if len(query_tensor.shape) == 2: - raise ValueError( - "query_tensor must be a tensor of shape (`seq_len`) or a list of tensors of shape (`seq_len`)" - ) - - if length_sampler is not None: - generation_kwargs["max_new_tokens"] = length_sampler() - if self.config.use_habana: - self.wrap_generation_for_hpu_graph_mode(self.model) - response = self.accelerator.unwrap_model(self.model).generate( - input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs - ) - if generate_ref_response: - with self.optional_peft_ctx(): - if self.config.use_habana: - self.wrap_generation_for_hpu_graph_mode(ref_model) - ref_response = ref_model.generate(input_ids=query_tensor.unsqueeze(dim=0), **generation_kwargs) - - if not return_prompt and not self.is_encoder_decoder: - response = response[:, query_tensor.shape[0] :] - if generate_ref_response: - ref_response = ref_response[:, query_tensor.shape[0] :] - - if generate_ref_response: - return response, ref_response - return response - - def _generate_batched( - self, - model: PreTrainedModelWrapper, - query_tensors: List[torch.Tensor], - length_sampler: Callable = None, - batch_size: int = 4, - return_prompt: bool = True, - pad_to_multiple_of: int = None, - remove_padding: bool = True, - **generation_kwargs, - ): - """ - Copied from PPOTrainer._generate_batched: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L509 - The only differences are: - - pad to pad_max_input_len to get static shape for generation acceleration - - use lazy mode and hpu_graphs for generation in hpu - """ - outputs = [] - - padding_side_default = self.tokenizer.padding_side - if not self.is_encoder_decoder: - self.tokenizer.padding_side = "left" - - # in case we have fewer examples than bs - batch_size = min(len(query_tensors), batch_size) - - for i in range(0, len(query_tensors), batch_size): - if length_sampler is not None: - generation_kwargs["max_new_tokens"] = length_sampler() - - # prevent overflow if query tensors are not even multiple of bs - end_index = min(len(query_tensors), i + batch_size) - - batch = query_tensors[i:end_index] - batch_mask = [torch.ones_like(element) for element in batch] - inputs = {"input_ids": batch, "attention_mask": batch_mask} - - if self.config.pad_for_acceleration and self.config.pad_max_input_len > 0: - padded_inputs = self.tokenizer.pad( - inputs, - padding="max_length", - max_length=self.config.pad_max_input_len, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors="pt", - 
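# [Editorial note, not part of the deleted file] With `pad_for_acceleration` enabled and
# `pad_max_input_len > 0`, this branch left-pads every query batch to one fixed length, so
# generation on HPU (combined with the `lazy_mode` / `hpu_graphs` kwargs set just below)
# reuses a single compiled graph instead of recompiling for each prompt length.
# Toy illustration, names and the length 128 are illustrative only:
#   padded = tokenizer.pad(
#       {"input_ids": batch, "attention_mask": batch_mask},
#       padding="max_length", max_length=128, return_tensors="pt",
#   )
#   padded["input_ids"].shape  # always (batch_size, 128), whatever the raw prompt lengths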
).to(self.current_device) - else: - padded_inputs = self.tokenizer.pad( - inputs, - padding=True, - max_length=None, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors="pt", - ).to(self.current_device) - - if self.config.use_habana: - generation_kwargs["ignore_eos"] = False - generation_kwargs["lazy_mode"] = True - generation_kwargs["hpu_graphs"] = True - - generations = self.accelerator.unwrap_model(model).generate(**padded_inputs, **generation_kwargs) - - for generation, mask in zip(generations, padded_inputs["attention_mask"]): - if not self.is_encoder_decoder: - output = generation[(1 - mask).sum() :] # remove padding - else: - output = generation - - if not return_prompt and not self.is_encoder_decoder: - output = output[(mask).sum() :] # remove prompt - - if remove_padding and self.tokenizer.eos_token_id in output: - pad_mask = output == self.tokenizer.eos_token_id - pad_start = torch.nonzero(pad_mask, as_tuple=False)[0, 0].item() - output = output[: pad_start + 1] # keep the eos token at the end - - outputs.append(output) - - self.tokenizer.padding_side = padding_side_default - return outputs - - @PPODecorators.empty_device_cache() - def step( - self, - queries: List[torch.LongTensor], - responses: List[torch.LongTensor], - scores: List[torch.FloatTensor], - response_masks: Optional[List[torch.LongTensor]] = None, - ): - """ - Copied from PPOTrainer.step: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L620 - The only differences are: - - use hpu_graphs for sampling and training - - remove duplicated padding if padding is done in prepare_model_inputs - """ - bs = self.config.batch_size - - queries, responses, scores, response_masks = self._step_safety_checker( - bs, queries, responses, scores, response_masks - ) - scores = torch.tensor(scores, device=self.current_device) - if self.config.use_score_scaling: - # Score scaling - scores_mean, scores_std = self.running.update(scores) - tensor_to_kwargs = {"dtype": scores.dtype, "device": scores.device} - score_scaling_factor = self.running.std.to(**tensor_to_kwargs) + torch.finfo(scores.dtype).eps - if self.config.use_score_norm: - scores = (scores - self.running.mean.to(**tensor_to_kwargs)) / score_scaling_factor - else: - scores /= score_scaling_factor - - if self.config.score_clip is not None: - # Score clipping - scores_dtype = scores.dtype - scores = torch.clip(scores.float(), -self.config.score_clip, self.config.score_clip).to(dtype=scores_dtype) - - # if we want to push best model to the hub - if hasattr(self, "highest_reward"): - if self.compare_step % self.config.compare_steps == 0: - curr_mean_reward = scores.mean() - # if the best reward ever seen - if curr_mean_reward > self.highest_reward: - self.highest_reward = curr_mean_reward - # push model to hub - self.push_to_hub(**self.push_to_hub_kwargs) - self.compare_step += 1 - - timing = {} - t0 = time.time() - - t = time.time() - - model_inputs = self.prepare_model_inputs(queries, responses) - - if self.is_distributed and not self.config.pad_for_acceleration: - pad_first = self.tokenizer.padding_side == "left" - - model_inputs["input_ids"] = self.accelerator.pad_across_processes( - model_inputs["input_ids"], - dim=1, - pad_index=self.tokenizer.pad_token_id, - pad_first=pad_first, - ) - model_inputs["attention_mask"] = self.accelerator.pad_across_processes( - model_inputs["attention_mask"], dim=1, pad_index=0, pad_first=pad_first - ) - if self.is_encoder_decoder: - model_inputs["decoder_input_ids"] = self.accelerator.pad_across_processes( - 
model_inputs["decoder_input_ids"], - dim=1, - pad_index=self.tokenizer.pad_token_id, - pad_first=pad_first, - ) - model_inputs["decoder_attention_mask"] = self.accelerator.pad_across_processes( - model_inputs["decoder_attention_mask"], - dim=1, - pad_index=0, - pad_first=pad_first, - ) - - model_inputs_names = list(model_inputs.keys()) - - full_kl_penalty = self.config.kl_penalty == "full" - - with torch.no_grad(): - if self.config.use_habana: - self.unwrap_generation_for_hpu_graph_mode(self.model) - self.wrap_fw_for_hpu_graph_mode(self.model) - if self.ref_model is not None: - self.unwrap_generation_for_hpu_graph_mode(self.ref_model) - self.wrap_fw_for_hpu_graph_mode(self.ref_model) - all_logprobs, logits_or_none, values, masks = self.batched_forward_pass( - self.model, - queries, - responses, - model_inputs, - response_masks=response_masks, - return_logits=full_kl_penalty, - ) - with self.optional_peft_ctx(): - ref_logprobs, ref_logits_or_none, _, _ = self.batched_forward_pass( - self.model if self.is_peft_model else self.ref_model, - queries, - responses, - model_inputs, - return_logits=full_kl_penalty, - ) - - timing["time/ppo/forward_pass"] = time.time() - t - - with torch.no_grad(): - t = time.time() - if full_kl_penalty: - active_full_logprobs = logprobs_from_logits(logits_or_none, None, gather=False) - ref_full_logprobs = logprobs_from_logits(ref_logits_or_none, None, gather=False) - - rewards, non_score_reward, kls = self.compute_rewards( - scores, active_full_logprobs, ref_full_logprobs, masks - ) - else: - rewards, non_score_reward, kls = self.compute_rewards(scores, all_logprobs, ref_logprobs, masks) - timing["time/ppo/compute_rewards"] = time.time() - t - - t = time.time() - values, advantages, returns = self.compute_advantages(values, rewards, masks) - timing["time/ppo/compute_advantages"] = time.time() - t - - # upcast to float32 to avoid dataset issues - batch_dict = { - "queries": queries, - "responses": responses, - "logprobs": all_logprobs.to(torch.float32), - "values": values.to(torch.float32), - "masks": masks, - "advantages": advantages, - "returns": returns, - } - batch_dict.update(model_inputs) - - t = time.time() - all_stats = [] - early_stop = False - if self.config.use_habana: - self.unwrap_fw_for_hpu_graph_mode(self.model) - import habana_frameworks.torch as ht - - model = self.accelerator.unwrap_model(self.model) - if not hasattr(model, "wrap_train_in_graph"): - model = ht.hpu.wrap_in_hpu_graph(model) - setattr(model, "wrap_train_in_graph", model.forward) - else: - model.forward = getattr(model, "wrap_train_in_graph") - - for _ in range(self.config.ppo_epochs): - if early_stop: - break - b_inds = np.random.permutation(bs) - for backward_batch_start in range(0, bs, self.config.backward_batch_size): - backward_batch_end = backward_batch_start + self.config.backward_batch_size - backward_batch_inds = b_inds[backward_batch_start:backward_batch_end] - - for mini_batch_start in range(0, self.config.backward_batch_size, self.config.mini_batch_size): - mini_batch_end = mini_batch_start + self.config.mini_batch_size - mini_batch_inds = backward_batch_inds[mini_batch_start:mini_batch_end] - mini_batch_dict = { - "logprobs": batch_dict["logprobs"][mini_batch_inds], - "values": batch_dict["values"][mini_batch_inds], - "masks": batch_dict["masks"][mini_batch_inds], - # hacks: the queries and responses are ragged. 
- "queries": [batch_dict["queries"][i] for i in mini_batch_inds], - "responses": [batch_dict["responses"][i] for i in mini_batch_inds], - "advantages": batch_dict["advantages"][mini_batch_inds], - "returns": batch_dict["returns"][mini_batch_inds], - } - for k in model_inputs_names: - mini_batch_dict[k] = batch_dict[k][mini_batch_inds] - with self.accelerator.accumulate(self.model): - model_inputs = {k: mini_batch_dict[k] for k in model_inputs_names} - - logprobs, logits, vpreds, _ = self.batched_forward_pass( - self.model, - mini_batch_dict["queries"], - mini_batch_dict["responses"], - model_inputs, - return_logits=True, - ) - train_stats = self.train_minibatch( - mini_batch_dict["logprobs"], - mini_batch_dict["values"], - logprobs, - logits, - vpreds, - mini_batch_dict["masks"], - mini_batch_dict["advantages"], - mini_batch_dict["returns"], - ) - all_stats.append(train_stats) - - # typically, early stopping is done at the epoch level - if self.config.early_stopping: - policykl = train_stats["policy/policykl"] - early_stop = self._early_stop(policykl) - if early_stop: - break - - timing["time/ppo/optimize_step"] = time.time() - t - - t = time.time() - train_stats = stack_dicts(all_stats) - - # reshape advantages/ratios such that they are not averaged. - train_stats["policy/advantages"] = torch.flatten(train_stats["policy/advantages"]).unsqueeze(0) - train_stats["policy/advantages"] = torch.nan_to_num(train_stats["policy/advantages"], WANDB_PADDING) - train_stats["policy/ratio"] = torch.flatten(train_stats["policy/ratio"]).unsqueeze(0) - - stats = self.record_step_stats( - scores=scores, - logprobs=all_logprobs, - ref_logprobs=ref_logprobs, - non_score_reward=non_score_reward, - train_stats=train_stats, - kl_coef=self.kl_ctl.value, - masks=masks, - queries=queries, - responses=responses, - kls=kls, - ) - # Gather/Reduce stats from all processes - if self.is_distributed: - stats = self.gather_stats(stats) - stats = stats_to_np(stats) - timing["time/ppo/calc_stats"] = time.time() - t - stats["ppo/learning_rate"] = self.optimizer.param_groups[0]["lr"] - - # Update the KL control - multiply the batch_size by the number of processes - self.kl_ctl.update( - stats["objective/kl"], - self.config.batch_size * self.accelerator.num_processes, - ) - - # Log the total ppo time - timing["time/ppo/total"] = time.time() - t0 - stats.update(timing) - - # post-process stats for tensorboard and other loggers - if self.config.log_with != "wandb": - stats = convert_to_scalar(stats) - - if self.lr_scheduler is not None: - self.lr_scheduler.step() - - return stats - - def prepare_model_inputs(self, queries: torch.Tensor, responses: torch.Tensor): - """ - Copied from PPOTrainer.prepare_model_inputs: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L921 - The only differences are: - - add padding to model inputs for static shape support in forward - """ - if self.is_encoder_decoder: - input_data = self.data_collator( - [{"input_ids": q, "attention_mask": torch.ones_like(q)} for q in queries] - ).to(self.current_device) - - decoder_inputs = self.data_collator( - [{"input_ids": r, "attention_mask": torch.ones_like(r)} for r in responses] - ).to(self.current_device) - - input_data["decoder_input_ids"] = decoder_inputs["input_ids"] - input_data["decoder_attention_mask"] = decoder_inputs["attention_mask"] - else: - input_ids = [torch.cat([q, r]) for q, r in zip(queries, responses)] - input_data = self.data_collator( - [{"input_ids": ids, "attention_mask": torch.ones_like(ids)} for ids in 
input_ids] - ).to(self.current_device) - - if self.config.pad_for_acceleration: - input_data["input_ids"] = torch.nn.functional.pad( - input_data["input_ids"], - (0, self.config.pad_max_len - input_data["input_ids"].shape[1]), - value=self.tokenizer.pad_token_id, - ) - input_data["attention_mask"] = torch.nn.functional.pad( - input_data["attention_mask"], - ( - 0, - self.config.pad_max_len - input_data["attention_mask"].shape[1], - ), - value=0, - ) - if self.is_encoder_decoder: - input_data["decoder_input_ids"] = torch.nn.functional.pad( - input_data["decoder_input_ids"], - ( - 0, - self.config.pad_max_len - input_data["decoder_input_ids"].shape[1], - ), - value=self.tokenizer.pad_token_id, - ) - input_data["decoder_attention_mask"] = torch.nn.functional.pad( - input_data["decoder_attention_mask"], - ( - 0, - self.config.pad_max_len - input_data["decoder_attention_mask"].shape[1], - ), - value=0, - ) - - input_data.pop("labels", None) # we don't want to compute LM losses - return input_data - - @PPODecorators.empty_device_cache() - def batched_forward_pass( - self, - model: PreTrainedModelWrapper, - queries: torch.Tensor, - responses: torch.Tensor, - model_inputs: dict, - return_logits: bool = False, - response_masks: Optional[torch.Tensor] = None, - ): - """ - Copied from PPOTrainer.batched_forward_pass: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L943 - The only differences are: - - input_kwargs/output need to clone() to avoid overidden in hpu - """ - bs = len(queries) - fbs = self.config.mini_batch_size - all_logprobs = [] - all_logits = [] - all_masks = [] - all_values = [] - - model.eval() - - for i in range(math.ceil(bs / fbs)): - input_kwargs = {key: value[i * fbs : (i + 1) * fbs].clone() for key, value in model_inputs.items()} - query_batch = queries[i * fbs : (i + 1) * fbs] - response_batch = responses[i * fbs : (i + 1) * fbs] - if response_masks is not None: - response_masks_batch = response_masks[i * fbs : (i + 1) * fbs] - logits, _, values = model(**input_kwargs) - - if self.is_encoder_decoder: - input_ids = input_kwargs["decoder_input_ids"] - attention_mask = input_kwargs["decoder_attention_mask"] - else: - input_ids = input_kwargs["input_ids"] - attention_mask = input_kwargs["attention_mask"] - - logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:]) - masks = torch.zeros_like(attention_mask) - masks[:, :-1] = attention_mask[:, 1:] - - for j in range(len(query_batch)): - if self.is_encoder_decoder: - # Decoder sentence starts always in the index 1 after padding in the Enc-Dec Models - start = 1 - end = attention_mask[j, :].sum() - 1 - else: - start = len(query_batch[j]) - 1 # logprobs starts from the second query token - if attention_mask[j, 0] == 0: # offset left padding - start += attention_mask[j, :].nonzero()[0] - end = start + len(response_batch[j]) - if response_masks is not None: - response_masks_batch[j] = torch.cat( - (torch.zeros_like(query_batch[j]), response_masks_batch[j]) - )[1:] - - masks[j, :start] = 0 - masks[j, end:] = 0 - if response_masks is not None: - masks[j, start:end] = masks[j, start:end] * response_masks_batch[j][start:end] - - if return_logits: - all_logits.append(logits.clone()) - else: - del logits - all_values.append(values.clone()) - all_logprobs.append(logprobs) - all_masks.append(masks) - - return ( - torch.cat(all_logprobs), - torch.cat(all_logits)[:, :-1] if return_logits else None, - torch.cat(all_values)[:, :-1], - torch.cat(all_masks)[:, :-1], - ) - - @PPODecorators.empty_device_cache() - def 
train_minibatch( - self, - old_logprobs: torch.FloatTensor, - values: torch.FloatTensor, - logprobs: torch.FloatTensor, - logits: torch.FloatTensor, - vpreds: torch.FloatTensor, - mask: torch.LongTensor, - advantages: torch.FloatTensor, - returns: torch.FloatTensor, - ): - """ - Copied from PPOTrainer.train_minibatch: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/ppo_trainer.py#L1034 - The only differences are: - - add htcore.mark_step - """ - self.model.train() - loss_p, loss_v, train_stats = self.loss( - old_logprobs, values, logits, vpreds, logprobs, mask, advantages, returns - ) - loss = loss_p + loss_v - global _recorded_graph - - if _recorded_graph is None: - _recorded_graph = ht.hpu.HPUGraph() - s = ht.hpu.default_stream() - - with ht.hpu.stream(s): - _recorded_graph.capture_begin() - self.accelerator.backward(loss) - _recorded_graph.capture_end() - else: - _recorded_graph.replay() - if self.config.max_grad_norm is not None: - if self.accelerator.sync_gradients: - self.accelerator.clip_grad_norm_(self.model_params, self.config.max_grad_norm) - self.optimizer.step() - if self.config.use_habana: - self.htcore.mark_step() - # we call optimizer.zero_grad() every time and let `accelerator` handle accumulation - # see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation#the-finished-code - self.optimizer.zero_grad() - return train_stats - - def wrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): - model = self.accelerator.unwrap_model(model) - if hasattr(model, "hpu_graph_fw"): - model.forward = model.hpu_graph_fw - else: - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model.orig_fw = model.forward - model = wrap_in_hpu_graph(model) - model.hpu_graph_fw = model.forward - - def unwrap_fw_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): - model = self.accelerator.unwrap_model(model) - if hasattr(model, "orig_fw"): - model.forward = model.orig_fw - - def wrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model = self.accelerator.unwrap_model(model) - if getattr(model, "is_peft_model", False): - if hasattr(model.pretrained_model.base_model.model, "hpu_graph_fw"): - model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.hpu_graph_fw - else: - model.pretrained_model.base_model.model.orig_fw = model.pretrained_model.base_model.model.forward - model.pretrained_model.base_model.model = wrap_in_hpu_graph(model.pretrained_model.base_model.model) - model.pretrained_model.base_model.model.hpu_graph_fw = model.pretrained_model.base_model.model.forward - else: - if hasattr(model.pretrained_model, "hpu_graph_fw"): - model.pretrained_model.forward = model.pretrained_model.hpu_graph_fw - else: - model.pretrained_model.orig_fw = model.pretrained_model.forward - model.pretrained_model = wrap_in_hpu_graph(model.pretrained_model) - model.pretrained_model.hpu_graph_fw = model.pretrained_model.forward - - def unwrap_generation_for_hpu_graph_mode(self, model: PreTrainedModelWrapper): - model = self.accelerator.unwrap_model(model) - if getattr(model, "is_peft_model", False): - if hasattr(model.pretrained_model.base_model.model, "orig_fw"): - model.pretrained_model.base_model.model.forward = model.pretrained_model.base_model.model.orig_fw - else: - if hasattr(model.pretrained_model, "orig_fw"): - model.pretrained_model.forward = model.pretrained_model.orig_fw diff --git a/optimum/habana/trl/trainer/reward_trainer.py 
b/optimum/habana/trl/trainer/reward_trainer.py deleted file mode 100644 index bbb0c761fe..0000000000 --- a/optimum/habana/trl/trainer/reward_trainer.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union - -import torch.nn as nn -from transformers import ( - PreTrainedTokenizerBase, -) -from transformers.utils import PaddingStrategy - -from optimum.habana import GaudiTrainer - - -class GaudiRewardTrainer(GaudiTrainer): - """ - Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L266 - """ - - def compute_loss(self, model, inputs, return_outputs=False): - rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0] - rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0] - loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean() - if return_outputs: - return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k} - return loss - - -@dataclass -class RewardDataCollatorWithPadding: - """ - Copied from https://github.com/huggingface/trl/blob/v0.7.6/examples/research_projects/stack_llama/scripts/reward_modeling.py#L206 - """ - - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str, PaddingStrategy] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - return_tensors: str = "pt" - - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: - features_j = [] - features_k = [] - for feature in features: - features_j.append( - { - "input_ids": feature["input_ids_j"], - "attention_mask": feature["attention_mask_j"], - } - ) - features_k.append( - { - "input_ids": feature["input_ids_k"], - "attention_mask": feature["attention_mask_k"], - } - ) - batch_j = self.tokenizer.pad( - features_j, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch_k = self.tokenizer.pad( - features_k, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch = { - "input_ids_j": batch_j["input_ids"], - "attention_mask_j": batch_j["attention_mask"], - "input_ids_k": batch_k["input_ids"], - "attention_mask_k": batch_k["attention_mask"], - "return_loss": True, - } - return batch diff --git a/optimum/habana/trl/trainer/sft_trainer.py b/optimum/habana/trl/trainer/sft_trainer.py deleted file mode 100644 index c6728f1ce2..0000000000 --- a/optimum/habana/trl/trainer/sft_trainer.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import dataclasses -import inspect -import warnings -from typing import Callable, Dict, List, Optional, Tuple, Union - -import torch -import torch.nn as nn -from datasets import Dataset -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - DataCollator, - DataCollatorForLanguageModeling, - PreTrainedModel, - PreTrainedTokenizerBase, -) -from transformers.trainer_callback import TrainerCallback -from transformers.trainer_utils import EvalPrediction -from trl import SFTTrainer -from trl.import_utils import is_peft_available -from trl.trainer.utils import ( - DataCollatorForCompletionOnlyLM, -) - - -if is_peft_available(): - from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training - -from ... import GaudiConfig, GaudiTrainer, GaudiTrainingArguments - - -class GaudiSFTTrainer(SFTTrainer, GaudiTrainer): - def __init__( - self, - model: Union[PreTrainedModel, nn.Module, str] = None, - args: GaudiTrainingArguments = None, - gaudi_config: GaudiConfig = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Optional[Callable[[], PreTrainedModel]] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - peft_config: Optional["PeftConfig"] = None, - dataset_text_field: Optional[str] = None, - packing: Optional[bool] = False, - formatting_func: Optional[Callable] = None, - max_seq_length: Optional[int] = None, - infinite: Optional[bool] = None, - num_of_sequences: Optional[int] = 1024, - chars_per_token: Optional[float] = 3.6, - dataset_num_proc: Optional[int] = None, - dataset_batch_size: int = 1000, - neftune_noise_alpha: Optional[float] = None, - model_init_kwargs: Optional[Dict] = None, - dataset_kwargs: Optional[Dict] = None, - ): - """ - Copied from SFTTrainer.__init__: https://github.com/huggingface/trl/blob/v0.7.6/trl/trainer/sft_trainer.py#L120 - The only differences are: - - add new args gaudi_config - - use GaudiTrainer instead of Trainer - - cast peft model to bf16. - """ - if model_init_kwargs is None: - model_init_kwargs = {} - elif not isinstance(model, str): - raise ValueError("You passed model_kwargs to the SFTTrainer. But your model is already instantiated.") - - if infinite is not None: - warnings.warn( - "The `infinite` argument is deprecated and will be removed in a future version of TRL. Use `TrainingArguments.max_steps` or `TrainingArguments.num_train_epochs` instead to control training length." - ) - - if isinstance(model, str): - warnings.warn( - "You passed a model_id to the SFTTrainer. This will automatically create an " - "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." 
- ) - model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) - - if packing and data_collator is not None and isinstance(data_collator, DataCollatorForCompletionOnlyLM): - raise ValueError( - "You passed a `DataCollatorForCompletionOnlyLM` to the SFTTrainer. This is not compatible with the `packing` argument." - ) - - if is_peft_available() and peft_config is not None: - if not isinstance(peft_config, PeftConfig): - raise ValueError( - "If you want to use the PeftModel, you need to pass a PeftConfig object to the SFTTrainer." - f" and you passed a {type(peft_config)}." - ) - - if not isinstance(model, PeftModel): - _support_gc_kwargs = hasattr( - args, "gradient_checkpointing_kwargs" - ) and "gradient_checkpointing_kwargs" in list( - inspect.signature(prepare_model_for_kbit_training).parameters - ) - gradient_checkpointing_kwargs = getattr(args, "gradient_checkpointing_kwargs", None) or {} - if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): - preprare_model_kwargs = { - "use_gradient_checkpointing": getattr(args, "gradient_checkpointing", False) - } - - if _support_gc_kwargs: - preprare_model_kwargs["gradient_checkpointing_kwargs"] = gradient_checkpointing_kwargs - - model = prepare_model_for_kbit_training(model, **preprare_model_kwargs) - - if args is not None: - args = dataclasses.replace(args, gradient_checkpointing=False) - elif getattr(args, "gradient_checkpointing", False) and ( - "use_reentrant" not in gradient_checkpointing_kwargs - or gradient_checkpointing_kwargs["use_reentrant"] - ): - # For backward compatibility with older versions of transformers - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - model = get_peft_model(model, peft_config) - if args.bf16: - model = model.to(torch.bfloat16) - - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) - if getattr(tokenizer, "pad_token", None) is None: - tokenizer.pad_token = tokenizer.eos_token - - if max_seq_length is None: - # to overcome some issues with broken tokenizers - max_seq_length = min(tokenizer.model_max_length, 1024) - - warnings.warn( - f"You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to {max_seq_length}" - ) - - self.dataset_num_proc = dataset_num_proc - self.dataset_batch_size = dataset_batch_size - - self._trainer_supports_neftune = hasattr(args, "neftune_noise_alpha") - - if neftune_noise_alpha is not None and self._trainer_supports_neftune: - args.neftune_noise_alpha = neftune_noise_alpha - warnings.warn( - "You passed a `neftune_noise_alpha` argument to the SFTTrainer, the value you passed will override the one in the `TrainingArguments`." - ) - # self.neftune_noise_alpha is done at Trainer level - elif not self._trainer_supports_neftune: - self.neftune_noise_alpha = neftune_noise_alpha - - if not packing: - if dataset_text_field is None and formatting_func is None: - raise ValueError( - "You passed `packing=False` to the SFTTrainer, but you didn't pass a `dataset_text_field` or `formatting_func` argument." 
- ) - - if data_collator is None: - data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) - - if dataset_kwargs is None: - dataset_kwargs = {} - if train_dataset is not None: - train_dataset = self._prepare_dataset( - train_dataset, - tokenizer, - packing, - dataset_text_field, - max_seq_length, - formatting_func, - num_of_sequences, - chars_per_token, - **dataset_kwargs, - ) - if eval_dataset is not None: - _multiple = isinstance(eval_dataset, dict) - _eval_datasets = eval_dataset if _multiple else {"singleton": eval_dataset} - for _eval_dataset_name, _eval_dataset in _eval_datasets.items(): - _eval_datasets[_eval_dataset_name] = self._prepare_dataset( - _eval_dataset, - tokenizer, - packing, - dataset_text_field, - max_seq_length, - formatting_func, - num_of_sequences, - chars_per_token, - **dataset_kwargs, - ) - if not _multiple: - eval_dataset = _eval_datasets["singleton"] - - if tokenizer.padding_side is not None and tokenizer.padding_side != "right": - warnings.warn( - "You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " - "overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code." - ) - - GaudiTrainer.__init__( - self, - model=model, - args=args, - gaudi_config=gaudi_config, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - model_init=model_init, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - if self.args.max_steps > 0 and packing: - warnings.warn( - "You passed `packing=True` to the SFTTrainer, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached." - ) - self.train_dataset.infinite = True - elif self.args.max_steps == -1 and packing: - self.train_dataset.infinite = False diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py deleted file mode 100755 index 2bac05e0eb..0000000000 --- a/optimum/habana/utils.py +++ /dev/null @@ -1,407 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import subprocess -import time -from typing import Any, Dict, List - -import numpy as np -import torch -from packaging import version -from transformers.utils import is_torch_available - -from optimum.utils import logging - -from .version import __version__ - - -logger = logging.get_logger(__name__) - - -CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.16.0") - - -def to_device_dtype(my_input: Any, target_device: torch.device = None, target_dtype: torch.dtype = None): - """ - Move a state_dict to the target device and convert it into target_dtype. - - Args: - my_input : input to transform - target_device (torch.device, optional): target_device to move the input on. Defaults to None. 
- target_dtype (torch.dtype, optional): target dtype to convert the input into. Defaults to None. - - Returns: - : transformed input - """ - if isinstance(my_input, torch.Tensor): - if target_device is None: - target_device = my_input.device - if target_dtype is None: - target_dtype = my_input.dtype - return my_input.to(device=target_device, dtype=target_dtype) - elif isinstance(my_input, list): - return [to_device_dtype(i, target_device, target_dtype) for i in my_input] - elif isinstance(my_input, tuple): - return tuple(to_device_dtype(i, target_device, target_dtype) for i in my_input) - elif isinstance(my_input, dict): - return {k: to_device_dtype(v, target_device, target_dtype) for k, v in my_input.items()} - else: - return my_input - - -def speed_metrics( - split: str, - start_time: float, - num_samples: int = None, - num_steps: int = None, - num_tokens: int = None, - start_time_after_warmup: float = None, - log_evaluate_save_time: float = None, -) -> Dict[str, float]: - """ - Measure and return speed performance metrics. - - This function requires a time snapshot `start_time` before the operation to be measured starts and this function - should be run immediately after the operation to be measured has completed. - - Args: - split (str): name to prefix metric (like train, eval, test...) - start_time (float): operation start time - num_samples (int, optional): number of samples processed. Defaults to None. - num_steps (int, optional): number of steps performed. Defaults to None. - num_tokens (int, optional): number of tokens processed. Defaults to None. - start_time_after_warmup (float, optional): time after warmup steps have been performed. Defaults to None. - log_evaluate_save_time (float, optional): time spent to log, evaluate and save. Defaults to None. - - Returns: - Dict[str, float]: dictionary with performance metrics. - """ - - runtime = time.time() - start_time - result = {f"{split}_runtime": round(runtime, 4)} - if runtime == 0: - return result - - # Adjust runtime if log_evaluate_save_time should not be included - if log_evaluate_save_time is not None: - runtime = runtime - log_evaluate_save_time - - # Adjust runtime if there were warmup steps - if start_time_after_warmup is not None: - runtime = runtime + start_time - start_time_after_warmup - - # Compute throughputs - if num_samples is not None: - samples_per_second = num_samples / runtime - result[f"{split}_samples_per_second"] = round(samples_per_second, 3) - if num_steps is not None: - steps_per_second = num_steps / runtime - result[f"{split}_steps_per_second"] = round(steps_per_second, 3) - if num_tokens is not None: - tokens_per_second = num_tokens / runtime - result[f"{split}_tokens_per_second"] = round(tokens_per_second, 3) - - return result - - -def warmup_inference_steps_time_adjustment( - start_time_after_warmup, start_time_after_inference_steps_warmup, num_inference_steps, warmup_steps -): - """ - Adjust start time after warmup to account for warmup inference steps. - - When warmup is applied to multiple inference steps within a single sample generation we need to account for - skipped inference steps time to estimate "per sample generation time". This function computes the average - inference time per step and adjusts the start time after warmup accordingly. 
- - Args: - start_time_after_warmup: time after warmup steps have been performed - start_time_after_inference_steps_warmup: time after warmup inference steps have been performed - num_inference_steps: total number of inference steps per sample generation - warmup_steps: number of warmup steps - - Returns: - [float]: adjusted start time after warmup which accounts for warmup inference steps based on average non-warmup steps time - """ - if num_inference_steps > warmup_steps: - avg_time_per_inference_step = (time.time() - start_time_after_inference_steps_warmup) / ( - num_inference_steps - warmup_steps - ) - start_time_after_warmup -= avg_time_per_inference_step * warmup_steps - return start_time_after_warmup - - -def to_gb_rounded(mem: float) -> float: - """ - Rounds and converts to GB. - - Args: - mem (float): memory in bytes - - Returns: - float: memory in GB rounded to the second decimal - """ - return np.round(mem / 1024**3, 2) - - -def get_hpu_memory_stats(device=None) -> Dict[str, float]: - """ - Returns memory stats of HPU as a dictionary: - - current memory allocated (GB) - - maximum memory allocated (GB) - - total memory available (GB) - - Returns: - Dict[str, float]: memory stats. - """ - from habana_frameworks.torch.hpu import memory_stats - - mem_stats = memory_stats(device) - - mem_dict = { - "memory_allocated (GB)": to_gb_rounded(mem_stats["InUse"]), - "max_memory_allocated (GB)": to_gb_rounded(mem_stats["MaxInUse"]), - "total_memory_available (GB)": to_gb_rounded(mem_stats["Limit"]), - } - - return mem_dict - - -def set_seed(seed: int): - """ - Helper function for reproducible behavior to set the seed in `random`, `numpy` and `torch`. - Args: - seed (`int`): The seed to set. - """ - random.seed(seed) - np.random.seed(seed) - if is_torch_available(): - from habana_frameworks.torch.hpu import random as hpu_random - - torch.manual_seed(seed) - hpu_random.manual_seed_all(seed) - - -def check_synapse_version(): - """ - Checks whether the versions of SynapseAI and drivers have been validated for the current version of Optimum Habana. - """ - # Change the logging format - logging.enable_default_handler() - logging.enable_explicit_format() - - # Check the version of habana_frameworks - habana_frameworks_version_number = get_habana_frameworks_version() - if ( - habana_frameworks_version_number.major != CURRENTLY_VALIDATED_SYNAPSE_VERSION.major - or habana_frameworks_version_number.minor != CURRENTLY_VALIDATED_SYNAPSE_VERSION.minor - ): - logger.warning( - f"optimum-habana v{__version__} has been validated for SynapseAI v{CURRENTLY_VALIDATED_SYNAPSE_VERSION} but habana-frameworks v{habana_frameworks_version_number} was found, this could lead to undefined behavior!" - ) - - # Check driver version - driver_version = get_driver_version() - # This check is needed to make sure an error is not raised while building the documentation - # Because the doc is built on an instance that does not have `hl-smi` - if driver_version is not None: - if ( - driver_version.major != CURRENTLY_VALIDATED_SYNAPSE_VERSION.major - or driver_version.minor != CURRENTLY_VALIDATED_SYNAPSE_VERSION.minor - ): - logger.warning( - f"optimum-habana v{__version__} has been validated for SynapseAI v{CURRENTLY_VALIDATED_SYNAPSE_VERSION} but the driver version is v{driver_version}, this could lead to undefined behavior!" - ) - else: - logger.warning( - "Could not run `hl-smi`, please follow the installation guide: https://docs.habana.ai/en/latest/Installation_Guide/index.html." 
- ) - - -def get_habana_frameworks_version(): - """ - Returns the installed version of SynapseAI. - """ - output = subprocess.run( - "pip list | grep habana-torch-plugin", - shell=True, - text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - return version.parse(output.stdout.split("\n")[0].split()[-1]) - - -def get_driver_version(): - """ - Returns the driver version. - """ - # Enable console printing for `hl-smi` check - output = subprocess.run( - "hl-smi", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"ENABLE_CONSOLE": "true"} - ) - if output.returncode == 0 and output.stdout: - return version.parse(output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0]) - return None - - -class HabanaGenerationtime(object): - def __init__(self, iteration_times: List[float] = None): - self.iteration_times = iteration_times - self.start_time = 0 - self.end_time = 0 - - def start(self): - self.start_time = time.perf_counter() - - def step(self): - self.end_time = time.perf_counter() - self.iteration_times.append(self.end_time - self.start_time) - self.start_time = self.end_time - - -class HabanaProfile(object): - """ - HPU profiler only could be run once, so HABANA_PROFILE_ENABLED, a class static variable shared by all the instances of HabanaProfile, is used to control which part will be captured. - """ - - HABANA_PROFILE_ENABLED = True - - def __init__( - self, - warmup: int = 0, - active: int = 0, - record_shapes: bool = True, - output_dir: str = "./hpu_profile", - wait: int = 0, - ): - if active <= 0 or warmup < 0 or not HabanaProfile.HABANA_PROFILE_ENABLED: - - def noop(): - pass - - self.start = noop - self.stop = noop - self.step = noop - else: - HabanaProfile.HABANA_PROFILE_ENABLED = False - schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1) - activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU] - - profiler = torch.profiler.profile( - schedule=schedule, - activities=activities, - on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir), - record_shapes=record_shapes, - with_stack=False, - ) - self.start = profiler.start - self.stop = profiler.stop - self.step = profiler.step - HabanaProfile.enable.invalid = True - HabanaProfile.disable.invalid = True - - def stop(self): - self.stop() - - def start(self): - self.start() - - def step(self): - self.step() - - @staticmethod - def disable(): - """ - Runs only once and must happen before doing profiling. - """ - if hasattr(HabanaProfile.disable, "invalid"): - if not HabanaProfile.disable.invalid: - HabanaProfile.HABANA_PROFILE_ENABLED = False - else: - HabanaProfile.HABANA_PROFILE_ENABLED = False - - @staticmethod - def enable(): - """ - Runs only once and must happen before doing profiling. - """ - if hasattr(HabanaProfile.enable, "invalid"): - if not HabanaProfile.enable.invalid: - HabanaProfile.HABANA_PROFILE_ENABLED = True - else: - HabanaProfile.HABANA_PROFILE_ENABLED = True - - -def check_optimum_habana_min_version(min_version): - """ - Checks if the installed version of `optimum-habana` is larger than or equal to `min_version`. 
- - Copied from: https://github.com/huggingface/transformers/blob/c41291965f078070c5c832412f5d4a5f633fcdc4/src/transformers/utils/__init__.py#L212 - """ - if version.parse(__version__) < version.parse(min_version): - error_message = ( - f"This example requires `optimum-habana` to have a minimum version of {min_version}," - f" but the version found is {__version__}.\n" - ) - if "dev" in min_version: - error_message += ( - "You can install it from source with: " - "`pip install git+https://github.com/huggingface/optimum-habana.git`." - ) - raise ImportError(error_message) - - -def check_habana_frameworks_min_version(min_version): - """ - Checks if the installed version of `habana_frameworks` is larger than or equal to `min_version`. - """ - if get_habana_frameworks_version() < version.parse(min_version): - return False - else: - return True - - -def check_habana_frameworks_version(req_version): - """ - Checks if the installed version of `habana_frameworks` is equal to `req_version`. - """ - return (get_habana_frameworks_version().major == version.parse(req_version).major) and ( - get_habana_frameworks_version().minor == version.parse(req_version).minor - ) - - -def get_device_name(): - """ - Returns the name of the current device: Gaudi or Gaudi2. - - Inspired from: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 - """ - import habana_frameworks.torch.utils.experimental as htexp - - device_type = htexp._get_device_type() - - if device_type == htexp.synDeviceType.synDeviceGaudi: - return "gaudi" - elif device_type == htexp.synDeviceType.synDeviceGaudi2: - return "gaudi2" - else: - raise ValueError(f"Unsupported device: the device type is {device_type}.") - - -def is_gaudi3(): - import habana_frameworks.torch as htorch - return htorch.hpu.get_device_name() == "GAUDI3" diff --git a/optimum/habana/version.py b/optimum/habana/version.py deleted file mode 100644 index fe4565bc6d..0000000000 --- a/optimum/habana/version.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__version__ = "1.12.0" diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index b7896da5e8..0000000000 --- a/pyproject.toml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -[tool.ruff] -line-length = 119 - -[tool.ruff.lint] -# Never enforce `E501` (line length violations). -ignore = ["C901", "E501", "E741", "F402", "F823"] -select = ["C", "E", "F", "I", "W"] -exclude = ["text-generation-inference"] - -# Ignore import violations in all `__init__.py` files. -[tool.ruff.lint.per-file-ignores] -"__init__.py" = ["E402", "F401", "F403", "F811"] - -[tool.ruff.lint.isort] -lines-after-imports = 2 -known-first-party = ["optimum.habana"] - -[tool.ruff.format] -# Like Black, use double quotes for strings. -quote-style = "double" - -# Like Black, indent with spaces, rather than tabs. -indent-style = "space" - -# Like Black, respect magic trailing commas. -skip-magic-trailing-comma = false - -# Like Black, automatically detect the appropriate line ending. -line-ending = "auto" diff --git a/readme_logo_dark.png b/readme_logo_dark.png deleted file mode 100644 index 7923c7c5c2..0000000000 Binary files a/readme_logo_dark.png and /dev/null differ diff --git a/readme_logo_light.png b/readme_logo_light.png deleted file mode 100644 index 717ffc57df..0000000000 Binary files a/readme_logo_light.png and /dev/null differ diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5f47c5c6be..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[tool:pytest] -doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS diff --git a/setup.py b/setup.py deleted file mode 100644 index e8585ff7f6..0000000000 --- a/setup.py +++ /dev/null @@ -1,94 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - -from setuptools import find_namespace_packages, setup - - -# Ensure we match the version set in optimum/habana/version.py -try: - filepath = "optimum/habana/version.py" - with open(filepath) as version_file: - (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) -except Exception as error: - assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) - - -INSTALL_REQUIRES = [ - "transformers >= 4.40.0, < 4.41.0", - "optimum", - "torch", - "accelerate < 0.28.0", - "diffusers >= 0.26.0, < 0.27.0", - "huggingface_hub < 0.23.0", - "datasets < 2.20.0", - "pytest < 8.0.0", -] - -TESTS_REQUIRE = [ - "psutil", - "parameterized", - "GitPython", - "optuna", - "sentencepiece", - "datasets", - "safetensors", - "pytest < 8.0.0", -] - -QUALITY_REQUIRES = [ - "ruff", - "hf_doc_builder", -] - -EXTRAS_REQUIRE = { - "tests": TESTS_REQUIRE, - "quality": QUALITY_REQUIRES, -} - -setup( - name="optimum-habana", - version=__version__, - description=( - "Optimum Habana is the interface between the Hugging Face Transformers and Diffusers libraries and Habana's" - " Gaudi processor (HPU). It provides a set of tools enabling easy model loading, training and inference on" - " single- and multi-HPU settings for different downstream tasks." 
- ), - long_description=open("README.md", "r", encoding="utf-8").read(), - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "License :: OSI Approved :: Apache Software License", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Science/Research", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - keywords="transformers, diffusers, mixed-precision training, fine-tuning, gaudi, hpu", - url="https://huggingface.co/hardware/habana", - author="HuggingFace Inc. Special Ops Team", - author_email="hardware@huggingface.co", - license="Apache", - packages=find_namespace_packages(include=["optimum*"]), - install_requires=INSTALL_REQUIRES, - extras_require=EXTRAS_REQUIRE, - include_package_data=True, - zip_safe=False, -) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/baselines/CodeLlama_13b_Instruct_hf.json b/tests/baselines/CodeLlama_13b_Instruct_hf.json deleted file mode 100644 index 93e77ee21c..0000000000 --- a/tests/baselines/CodeLlama_13b_Instruct_hf.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "gaudi2": { - "wikitext": { - "num_train_epochs": 1, - "eval_batch_size": 48, - "distribution": { - "deepspeed": { - "learning_rate": 5e-5, - "train_batch_size": 48, - "train_runtime": 371.0852, - "train_samples_per_second": 19.243, - "perplexity": 6.982, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--gradient_checkpointing", - "--use_hpu_graphs_for_inference", - "--deepspeed tests/configs/deepspeed_zero_1.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/LlamaGuard_7b.json b/tests/baselines/LlamaGuard_7b.json deleted file mode 100644 index a94b1988d1..0000000000 --- a/tests/baselines/LlamaGuard_7b.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "gaudi2": { - "mrpc": { - "num_train_epochs": 3, - "eval_batch_size": 8, - "distribution": { - "deepspeed": { - "learning_rate": 3e-5, - "train_batch_size": 32, - "eval_f1": 0.8726, - "train_runtime": 55.8644, - "train_samples_per_second": 349.869, - "extra_arguments": [ - "--max_seq_length 128", - "--add_pad_token True", - "--use_hpu_graphs_for_inference", - "--deepspeed tests/configs/deepspeed_zero_2.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/albert_large_v2.json b/tests/baselines/albert_large_v2.json deleted file mode 100644 index 2f13722a95..0000000000 --- a/tests/baselines/albert_large_v2.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "single_card": { - "learning_rate": 6e-5, - "train_batch_size": 32, - "eval_f1": 91.8679, - "train_runtime": 2900.5518, - "train_samples_per_second": 62.298, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 6e-5, - "train_batch_size": 32, - "eval_f1": 92.7647, - "train_runtime": 464.9893, - "train_samples_per_second": 494.936, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "single_card": { - "learning_rate": 6e-5, - "train_batch_size": 128, - "eval_f1": 92.4235, - "train_runtime": 571.138, 
- "train_samples_per_second": 321.635, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 7e-5, - "train_batch_size": 128, - "eval_f1": 92.2111, - "train_runtime": 115.15, - "train_samples_per_second": 2464.403, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/baselines/albert_xxlarge_v1.json deleted file mode 100644 index 8efe5d729d..0000000000 --- a/tests/baselines/albert_xxlarge_v1.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 2, - "distribution": { - "single_card": { - "learning_rate": 1e-5, - "train_batch_size": 12, - "eval_f1": 94.8803, - "train_runtime": 9374.7909, - "train_samples_per_second": 9.523, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 12, - "eval_f1": 95.1221, - "train_runtime": 1312.9496, - "train_samples_per_second": 75.51, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 2, - "distribution": { - "single_card": { - "learning_rate": 2e-5, - "train_batch_size": 16, - "eval_f1": 95.0726, - "train_runtime": 1747.4836, - "train_samples_per_second": 51.055, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 7e-5, - "train_batch_size": 16, - "eval_f1": 95.1227, - "train_runtime": 221.2125, - "train_samples_per_second": 439.114, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/ast_finetuned_speech_commands_v2.json b/tests/baselines/ast_finetuned_speech_commands_v2.json deleted file mode 100644 index 7341f2d68b..0000000000 --- a/tests/baselines/ast_finetuned_speech_commands_v2.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "gaudi2": { - "common_language": { - "num_train_epochs": 10, - "eval_batch_size": 64, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 32, - "eval_accuracy": 0.1871, - "train_runtime": 139.9477, - "train_samples_per_second": 1955.74, - "eval_samples_per_second": 2301.088, - "extra_arguments": [ - "--audio_column_name audio", - "--label_column_name language", - "--remove_unused_columns False", - "--max_length_seconds 8", - "--attention_mask False", - "--warmup_ratio 0.1", - "--seed 0", - "--dataloader_num_workers 1", - "--ignore_mismatched_sizes=True", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} diff --git a/tests/baselines/bert_base_uncased.json b/tests/baselines/bert_base_uncased.json deleted file mode 100644 index 18bcf59170..0000000000 --- a/tests/baselines/bert_base_uncased.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 5e-5, - "train_batch_size": 24, - "eval_f1": 87.3749, - "train_runtime": 568.832, - "train_samples_per_second": 158.687, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 24, - "eval_f1": 87.6017, - "train_runtime": 97.7157, - 
"train_samples_per_second": 1240.638, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "mrpc": { - "num_train_epochs": 3, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 6e-5, - "train_batch_size": 64, - "eval_f1": 0.8998, - "train_runtime": 31.044, - "train_samples_per_second": 558.201, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 64, - "eval_f1": 0.8765, - "train_runtime": 28.3865, - "train_samples_per_second": 3643.715, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/baselines/bert_large_uncased_whole_word_masking.json deleted file mode 100755 index 9bfd3a1f89..0000000000 --- a/tests/baselines/bert_large_uncased_whole_word_masking.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - "train_batch_size": 24, - "eval_f1": 93.1962, - "train_runtime": 1678.3456, - "train_samples_per_second": 54.101, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 7e-5, - "train_batch_size": 24, - "eval_f1": 93.1869, - "train_runtime": 309.9553, - "train_samples_per_second": 398.459, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "mrpc": { - "num_train_epochs": 3, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - "train_batch_size": 32, - "eval_f1": 0.9022, - "train_runtime": 90.3943, - "train_samples_per_second": 172.792, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 3e-5, - "train_batch_size": 16, - "eval_f1": 0.8897, - "train_runtime": 65.644, - "train_samples_per_second": 919.623, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 4e-5, - "train_batch_size": 32, - "eval_f1": 93.2753, - "train_runtime": 342.1722, - "train_samples_per_second": 286.435, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 8e-5, - "train_batch_size": 32, - "eval_f1": 92.6726, - "train_runtime": 77.307, - "train_samples_per_second": 2150.333, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "mrpc": { - "num_train_epochs": 3, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 9e-5, - "train_batch_size": 256, - "eval_f1": 0.8998, - "train_runtime": 33.2909, - "train_samples_per_second": 1151.598, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 3e-5, - "train_batch_size": 40, - "eval_f1": 0.8758, - "train_runtime": 41.4282, - "train_samples_per_second": 2771.405, - "extra_arguments": [ - "--max_seq_length 128", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/bloom_7b1.json 
b/tests/baselines/bloom_7b1.json deleted file mode 100644 index 37251e8651..0000000000 --- a/tests/baselines/bloom_7b1.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "gaudi": { - "wikitext": { - "num_train_epochs": 3, - "eval_batch_size": 4, - "distribution": { - "deepspeed": { - "learning_rate": 1e-4, - "train_batch_size": 8, - "train_runtime": 1556.481, - "train_samples_per_second": 4.757, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_cache False", - "--gradient_checkpointing", - "--save_strategy no", - "--deepspeed tests/configs/deepspeed_zero_3_gaudi1.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/baselines/bridgetower_large_itm_mlm_itc.json deleted file mode 100644 index e188228256..0000000000 --- a/tests/baselines/bridgetower_large_itm_mlm_itc.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "gaudi2": { - "jmhessel/newyorker_caption_contest": { - "num_train_epochs": 5, - "eval_batch_size": 16, - "distribution": { - "multi_card": { - "learning_rate": 1e-5, - "train_batch_size": 48, - "train_runtime": 314.5877, - "train_samples_per_second": 918.387, - "extra_arguments": [ - "--dataset_config_name matching", - "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6", - "--image_column image", - "--caption_column image_description", - "--remove_unused_columns False", - "--mediapipe_dataloader", - "--dataloader_num_workers 2", - "--logging_steps 10", - "--use_hpu_graphs_for_inference", - "--distribution_strategy fast_ddp" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/clip_roberta.json b/tests/baselines/clip_roberta.json deleted file mode 100755 index e2b14ba68e..0000000000 --- a/tests/baselines/clip_roberta.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "gaudi": { - "ydshieh/coco_dataset_script": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 64, - "train_runtime": 314.7726, - "train_samples_per_second": 2560.999, - "extra_arguments": [ - "--data_dir $PWD/", - "--dataset_config_name 2017", - "--image_column image_path", - "--caption_column caption", - "--remove_unused_columns False", - "--warmup_steps 0", - "--weight_decay 0.1", - "--save_strategy epoch", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 16" - ] - } - } - } - }, - "gaudi2": { - "ydshieh/coco_dataset_script": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 512, - "train_runtime": 63.36, - "train_samples_per_second": 18434.069, - "extra_arguments": [ - "--data_dir $PWD/", - "--dataset_config_name 2017", - "--image_column image_path", - "--caption_column caption", - "--remove_unused_columns False", - "--warmup_steps 0", - "--weight_decay 0.1", - "--save_strategy epoch", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 16", - "--distribution_strategy fast_ddp", - "--mediapipe_dataloader" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/baselines/distilbert_base_uncased.json deleted file mode 100644 index 00482ebeea..0000000000 --- a/tests/baselines/distilbert_base_uncased.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 1e-4, - 
"train_batch_size": 48, - "eval_f1": 84.5384, - "train_runtime": 264.3669, - "train_samples_per_second": 344.126, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 4e-4, - "train_batch_size": 48, - "eval_f1": 83.0667, - "train_runtime": 54.5344, - "train_samples_per_second": 2503.657, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 2e-4, - "train_batch_size": 64, - "eval_f1": 84.5418, - "train_runtime": 117.8054, - "train_samples_per_second": 1547.185, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 3e-4, - "train_batch_size": 64, - "eval_f1": 83.2233, - "train_runtime": 24.0441, - "train_samples_per_second": 11144.651, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/falcon_40b.json b/tests/baselines/falcon_40b.json deleted file mode 100644 index cb08dc4ed4..0000000000 --- a/tests/baselines/falcon_40b.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "gaudi2": { - "timdettmers/openassistant-guanaco": { - "num_train_epochs": 3, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 4e-4, - "train_batch_size": 1, - "perplexity": 4.0893, - "train_runtime": 931.1213, - "train_samples_per_second": 28.162, - "extra_arguments": [ - "--bf16", - "--gradient_accumulation_steps 16", - "--evaluation_strategy no", - "--save_strategy no", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--max_grad_norm 0.3", - "--logging_steps 1", - "--use_hpu_graphs_for_inference", - "--lora_rank 64", - "--lora_alpha 16", - "--lora_dropout 0.1", - "--lora_target_modules query_key_value dense dense_h_to_4h dense_4h_to_h", - "--dataset_concatenation", - "--max_seq_length 256", - "--low_cpu_mem_usage True", - "--adam_epsilon 1e-08", - "--ddp_bucket_cap_mb 50", - "--pipelining_fwd_bwd", - "--validation_split_percentage 10" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/flan_t5_xxl.json b/tests/baselines/flan_t5_xxl.json deleted file mode 100644 index 779bc9fd83..0000000000 --- a/tests/baselines/flan_t5_xxl.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "gaudi2": { - "cnn_dailymail": { - "num_train_epochs": 2, - "eval_batch_size": 22, - "distribution": { - "deepspeed": { - "learning_rate": 1e-4, - "train_batch_size": 22, - "eval_rougeLsum": 0.1429, - "train_runtime": 89.486, - "train_samples_per_second": 27.299, - "extra_arguments": [ - "--max_steps 10", - "--max_eval_samples 880", - "--dataset_config 3.0.0", - "--source_prefix summarize: ", - "--predict_with_generate", - "--ignore_pad_token_for_loss False", - "--pad_to_max_length", - "--generation_max_length 129", - "--gradient_checkpointing", - "--adam_epsilon 1e-08", - "--deepspeed examples/summarization/ds_flan_t5_z3_config_bf16.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/gpt2.json b/tests/baselines/gpt2.json deleted file mode 100644 index 889bdbd3d4..0000000000 --- a/tests/baselines/gpt2.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "gaudi": { - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "single_card": { - "learning_rate": 5e-5, - "train_batch_size": 4, - "perplexity": 
22.2751, - "train_runtime": 225.2898, - "train_samples_per_second": 21.308, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--gradient_checkpointing" - ] - }, - "multi_card": { - "learning_rate": 4e-4, - "train_batch_size": 4, - "perplexity": 22.2699, - "train_runtime": 68.9627, - "train_samples_per_second": 156.241, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--gradient_checkpointing" - ] - } - } - } - }, - "gaudi2": { - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "single_card": { - "learning_rate": 2e-4, - "train_batch_size": 16, - "perplexity": 21.0729, - "train_runtime": 43.9361, - "train_samples_per_second": 130.785, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 8e-4, - "train_batch_size": 16, - "perplexity": 21.7858, - "train_runtime": 23.8993, - "train_samples_per_second": 939.24, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/gpt2_xl.json b/tests/baselines/gpt2_xl.json deleted file mode 100644 index ffd92331cb..0000000000 --- a/tests/baselines/gpt2_xl.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "gaudi": { - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "deepspeed": { - "learning_rate": 5e-5, - "train_batch_size": 2, - "perplexity": 12.6744, - "train_runtime": 366.8694, - "train_samples_per_second": 16.464, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--deepspeed tests/configs/deepspeed_zero_2.json" - ] - } - } - } - }, - "gaudi2": { - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 4, - "distribution": { - "deepspeed": { - "learning_rate": 4e-4, - "train_batch_size": 16, - "perplexity": 13.0461, - "train_runtime": 190.696, - "train_samples_per_second": 89.877, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--gradient_checkpointing", - "--use_hpu_graphs_for_inference", - "--deepspeed tests/configs/deepspeed_zero_2.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/gpt_neox_20b.json b/tests/baselines/gpt_neox_20b.json deleted file mode 100644 index 61b27156bf..0000000000 --- a/tests/baselines/gpt_neox_20b.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "gaudi2": { - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 2, - "distribution": { - "deepspeed": { - "learning_rate": 5e-5, - "train_batch_size": 2, - "perplexity": 8.0545, - "train_runtime": 721.5428, - "train_samples_per_second": 7.571, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--gradient_checkpointing", - "--use_hpu_graphs_for_inference", - "--deepspeed tests/configs/deepspeed_zero_2.json" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/llama_7b.json b/tests/baselines/llama_7b.json deleted file mode 100644 index 6f05689928..0000000000 --- a/tests/baselines/llama_7b.json +++ /dev/null @@ -1,280 +0,0 @@ -{ - "gaudi": { - "tatsu-lab/alpaca": { - "num_train_epochs": 1, - "eval_batch_size": 2, - "distribution": { - "multi_card": { - "learning_rate": 1e-4, - "train_batch_size": 2, - "perplexity": 2.7542, - "train_runtime": 538.0159, - "train_samples_per_second": 20.397, - "extra_arguments": [ - "--bf16", - 
"--gradient_accumulation_steps 4", - "--save_strategy no", - "--use_hpu_graphs_for_inference", - "--dataset_concatenation", - "--validation_split_percentage 10", - "--max_steps 100", - "--attn_softmax_bf16" - ] - } - } - } - }, - "gaudi2": { - "tatsu-lab/alpaca": { - "num_train_epochs": 3, - "eval_batch_size": 4, - "distribution": { - "multi_card": { - "learning_rate": 3e-4, - "train_batch_size": 8, - "perplexity": 2.3665, - "train_runtime": 294.5707, - "train_samples_per_second": 148.093, - "extra_arguments": [ - "--bf16", - "--gradient_accumulation_steps 2", - "--evaluation_strategy no", - "--save_strategy no", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--max_grad_norm 0.3", - "--logging_steps 1", - "--use_hpu_graphs_for_inference", - "--lora_rank 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules q_proj v_proj", - "--dataset_concatenation", - "--max_seq_length 512", - "--low_cpu_mem_usage True", - "--adam_epsilon 1e-08", - "--ddp_bucket_cap_mb 50", - "--validation_split_percentage 10", - "--attn_softmax_bf16" - ] - } - } - }, - "tatsu-lab/alpaca_fsdpcompile": { - "num_train_epochs": 1, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 3e-4, - "train_batch_size": 8, - "perplexity": 2.4259, - "train_runtime": 186.2483, - "train_samples_per_second": 93.5, - "extra_arguments": [ - "--bf16 True", - "--gradient_accumulation_steps 2", - "--evaluation_strategy no", - "--save_strategy no", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--max_grad_norm 0.3", - "--logging_steps 1", - "--lora_rank 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules q_proj v_proj", - "--dataset_concatenation", - "--max_seq_length 512", - "--low_cpu_mem_usage True", - "--adam_epsilon 1e-08", - "--ddp_bucket_cap_mb 50", - "--validation_split_percentage 10", - "--attn_softmax_bf16", - "--pipelining_fwd_bwd False", - "--fsdp auto_wrap", - "--torch_compile_backend hpu_backend", - "--torch_compile", - "--fsdp_config examples/language-modeling/fsdp_config.json" - ] - } - } - }, - "tatsu-lab/alpaca_fp8": { - "num_train_epochs": 3, - "eval_batch_size": 4, - "distribution": { - "multi_card": { - "learning_rate": 3e-4, - "train_batch_size": 16, - "perplexity": 2.3692, - "train_runtime": 411.9935, - "train_samples_per_second": 232.439, - "extra_arguments": [ - "--bf16", - "--gradient_accumulation_steps 1", - "--evaluation_strategy no", - "--save_strategy no", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--logging_steps 40", - "--lora_rank 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules q_proj v_proj", - "--dataset_concatenation", - "--max_seq_length 512", - "--low_cpu_mem_usage True", - "--adam_epsilon 1e-08", - "--ddp_bucket_cap_mb 50", - "--validation_split_percentage 10", - "--pipelining_fwd_bwd", - "--throughput_warmup_steps 18", - "--use_lazy_mode", - "--max_grad_norm 0.3", - "--fp8" - ] - } - } - }, - "trl-sft": { - "num_train_epochs": 1, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 1e-4, - "train_batch_size": 4, - "train_runtime": 206, - "train_samples_per_second": 51.54, - "extra_arguments": [ - "--bf16 True", - "--gradient_accumulation_steps 2", - "--evaluation_strategy no", - "--save_strategy no", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--max_grad_norm 0.3", - "--logging_steps 1", - "--lora_r 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules q_proj v_proj", - "--seq_length 1024", - 
"--optim paged_adamw_32bit", - "--weight_decay 0.05", - "--report_to none", - "--max_steps 100" - ] - } - } - }, - "trl-dpo": { - "num_train_epochs": 1, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 1, - "train_runtime": 234.6471, - "train_samples_per_second": 13.499, - "extra_arguments": [ - "--logging_steps 1", - "--lora_r 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules q_proj v_proj k_proj out_proj fc_in fc_out wte", - "--max_length 1024", - "--max_prompt_length 512", - "--report_to none", - "--max_steps 100", - "--eval_steps 200", - "--lr_scheduler_type cosine", - "--warmup_steps 0", - "--weight_decay 0.05", - "--optimizer_type paged_adamw_32bit", - "--beta 0.1", - "--gradient_accumulation_steps 4", - "--sanity_check" - ] - } - } - }, - "prompt-tuning": { - "num_train_epochs": 20, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 1, - "train_runtime": 16.5, - "train_samples_per_second": 63.161, - "perplexity": 1.224, - "extra_arguments": [ - "--num_virtual_tokens 8", - "--max_seq_length 64", - "--logging_steps 1", - "--report_to none", - "--max_steps 100", - "--peft_type prompt_tuning", - "--max_seq_length 64", - "--lr_scheduler_type cosine", - "--warmup_steps 0", - "--weight_decay 0.05", - "--gradient_accumulation_steps 1" - ] - } - } - }, - "prefix-tuning": { - "num_train_epochs": 20, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 1, - "train_runtime": 16.5, - "train_samples_per_second": 63.161, - "perplexity": 1.224, - "extra_arguments": [ - "--num_virtual_tokens 8", - "--max_seq_length 64", - "--logging_steps 1", - "--report_to none", - "--max_steps 100", - "--peft_type prompt_tuning", - "--max_seq_length 64", - "--lr_scheduler_type cosine", - "--warmup_steps 0", - "--weight_decay 0.05", - "--gradient_accumulation_steps 1" - ] - } - } - }, - "p-tuning": { - "num_train_epochs": 20, - "eval_batch_size": 1, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 1, - "train_runtime": 16.5, - "train_samples_per_second": 63.161, - "perplexity": 1.224, - "extra_arguments": [ - "--num_virtual_tokens 8", - "--max_seq_length 64", - "--logging_steps 1", - "--report_to none", - "--max_steps 100", - "--peft_type prompt_tuning", - "--max_seq_length 64", - "--lr_scheduler_type cosine", - "--warmup_steps 0", - "--weight_decay 0.05", - "--gradient_accumulation_steps 1" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/protst_esm1b_for_sequential_classification.json b/tests/baselines/protst_esm1b_for_sequential_classification.json deleted file mode 100644 index c6e2cb2a76..0000000000 --- a/tests/baselines/protst_esm1b_for_sequential_classification.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "gaudi2": { - "prost-sequence-classification": { - "num_train_epochs": 1, - "eval_batch_size": 4, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 32, - "train_runtime": 32.4244, - "train_samples_per_second": 750, - "eval_accuracy":0.5430884904569115, - "extra_arguments": [ - "--save_strategy no", - "--tokenizer_name facebook/esm1b_t33_650M_UR50S", - "--use_hpu_graphs_for_inference", - "--use_hpu_graphs_for_training", - "--trust_remote_code", - "--torch_dtype bfloat16", - "--label_names labels" - ] - } - } - } - } -} diff --git a/tests/baselines/roberta_base.json b/tests/baselines/roberta_base.json deleted file 
mode 100644 index 210f608d27..0000000000 --- a/tests/baselines/roberta_base.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - "train_batch_size": 12, - "eval_f1": 91.9903, - "train_runtime": 599.9343, - "train_samples_per_second": 149.781, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 8e-5, - "train_batch_size": 12, - "eval_f1": 91.624, - "train_runtime": 103.5987, - "train_samples_per_second": 1083.304, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 24, - "perplexity": 3.6338, - "train_runtime": 43.1541, - "train_samples_per_second": 554.787, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--ddp_find_unused_parameters True" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 7e-5, - "train_batch_size": 64, - "eval_f1": 91.5253, - "train_runtime": 105.6042, - "train_samples_per_second": 907.395, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 64, - "eval_f1": 90.8766, - "train_runtime": 32.2213, - "train_samples_per_second": 6568.625, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 8e-5, - "train_batch_size": 32, - "perplexity": 3.6691, - "train_runtime": 12.3633, - "train_samples_per_second": 2758.371, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--ddp_find_unused_parameters True" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/roberta_large.json b/tests/baselines/roberta_large.json deleted file mode 100755 index d5ffc8200b..0000000000 --- a/tests/baselines/roberta_large.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "gaudi": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - "train_batch_size": 12, - "eval_f1": 94.2959, - "train_runtime": 1771.3319, - "train_samples_per_second": 50.815, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 8e-5, - "train_batch_size": 12, - "eval_f1": 94.2867, - "train_runtime": 304.9084, - "train_samples_per_second": 366.177, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - "train_batch_size": 8, - "perplexity": 2.7851, - "train_runtime": 75.0033, - "train_samples_per_second": 217.752, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--ddp_find_unused_parameters True" - ] - } - } - } - }, - "gaudi2": { - "squad": { - "num_train_epochs": 1, - "eval_batch_size": 8, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - 
"train_batch_size": 32, - "eval_f1": 94.5886, - "train_runtime": 342.1653, - "train_samples_per_second": 284.873, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - }, - "multi_card": { - "learning_rate": 7e-5, - "train_batch_size": 32, - "eval_f1": 94.09, - "train_runtime": 77.333, - "train_samples_per_second": 2138.366, - "extra_arguments": [ - "--max_seq_length 384", - "--use_hpu_graphs_for_inference" - ] - } - } - }, - "wikitext": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 7e-5, - "train_batch_size": 16, - "perplexity": 2.829, - "train_runtime": 25.6323, - "train_samples_per_second": 1183.796, - "extra_arguments": [ - "--dataset_config_name wikitext-2-raw-v1", - "--use_hpu_graphs_for_inference", - "--ddp_find_unused_parameters True" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/baselines/swin_base_patch4_window7_224_in22k.json deleted file mode 100644 index b6c09b6dec..0000000000 --- a/tests/baselines/swin_base_patch4_window7_224_in22k.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "gaudi": { - "cifar10": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "single_card": { - "learning_rate": 3e-5, - "train_batch_size": 64, - "eval_accuracy": 0.9871, - "train_runtime": 246.4134, - "train_samples_per_second": 212.722, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--ignore_mismatched_sizes", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - }, - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 64, - "eval_accuracy": 0.9819, - "train_runtime": 117.6424, - "train_samples_per_second": 1683.344, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--ignore_mismatched_sizes", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - } - } - } - }, - "gaudi2": { - "cifar10": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "single_card": { - "learning_rate": 6e-5, - "train_batch_size": 160, - "eval_accuracy": 0.9852, - "train_runtime": 73.5918, - "train_samples_per_second": 957.491, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--ignore_mismatched_sizes", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - }, - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 160, - "eval_accuracy": 0.9821, - "train_runtime": 62.9986, - "train_samples_per_second": 6202.525, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--ignore_mismatched_sizes", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/t5_small.json b/tests/baselines/t5_small.json deleted file mode 100644 index ebbb774f89..0000000000 --- a/tests/baselines/t5_small.json +++ /dev/null @@ -1,102 +0,0 @@ -{ - "gaudi": { - "cnn_dailymail": { - "num_train_epochs": 1, - "eval_batch_size": 4, - "distribution": { - "multi_card": { - "learning_rate": 5e-5, - 
"train_batch_size": 4, - "eval_rougeLsum": 38.5895, - "train_runtime": 1089.366, - "train_samples_per_second": 267.843, - "eval_samples_per_second": 71.913, - "extra_arguments": [ - "--dataset_config \"3.0.0\"", - "--source_prefix \"summarize: \"", - "--predict_with_generate", - "--ignore_pad_token_for_loss False", - "--pad_to_max_length", - "--use_hpu_graphs_for_inference", - "--save_strategy epoch" - ] - } - } - }, - "squad_v2": { - "num_train_epochs": 2, - "eval_batch_size": 33, - "distribution": { - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 16, - "eval_f1": 64.8769, - "train_runtime": 230.6405, - "train_samples_per_second": 1235.893, - "extra_arguments": [ - "--context_column context", - "--question_column question", - "--answer_column answers", - "--version_2_with_negative", - "--max_seq_length 384", - "--predict_with_generate", - "--ignore_pad_token_for_loss False", - "--pad_to_max_length", - "--use_hpu_graphs_for_inference", - "--save_strategy epoch" - ] - } - } - } - }, - "gaudi2": { - "cnn_dailymail": { - "num_train_epochs": 1, - "eval_batch_size": 4, - "distribution": { - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 32, - "eval_rougeLsum": 38.5648, - "train_runtime": 164.962, - "train_samples_per_second": 1912.578, - "eval_samples_per_second": 116.48, - "extra_arguments": [ - "--dataset_config \"3.0.0\"", - "--source_prefix \"summarize: \"", - "--predict_with_generate", - "--ignore_pad_token_for_loss False", - "--pad_to_max_length", - "--use_hpu_graphs_for_inference", - "--save_strategy epoch" - ] - } - } - }, - "squad_v2": { - "num_train_epochs": 2, - "eval_batch_size": 33, - "distribution": { - "multi_card": { - "learning_rate": 2e-3, - "train_batch_size": 64, - "eval_f1": 65.7157, - "train_runtime": 49.5816, - "train_samples_per_second": 6353.351, - "extra_arguments": [ - "--context_column context", - "--question_column question", - "--answer_column answers", - "--version_2_with_negative", - "--max_seq_length 384", - "--predict_with_generate", - "--ignore_pad_token_for_loss False", - "--pad_to_max_length", - "--use_hpu_graphs_for_inference", - "--save_strategy epoch" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/baselines/vit_base_patch16_224_in21k.json deleted file mode 100644 index 03cd9f6131..0000000000 --- a/tests/baselines/vit_base_patch16_224_in21k.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "gaudi": { - "cifar10": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "single_card": { - "learning_rate": 5e-5, - "train_batch_size": 64, - "eval_accuracy": 0.9812, - "train_runtime": 136.9418, - "train_samples_per_second": 359.584, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - }, - "multi_card": { - "learning_rate": 2e-4, - "train_batch_size": 64, - "eval_accuracy": 0.9803, - "train_runtime": 59.972, - "train_samples_per_second": 2508.955, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True", - "--throughput_warmup_steps 10" - ] - } - } - } - }, - "gaudi2": { - "cifar10": { - "num_train_epochs": 1, - "eval_batch_size": 64, - "distribution": { - "single_card": { - 
"learning_rate": 6e-5, - "train_batch_size": 96, - "eval_accuracy": 0.9813, - "train_runtime": 53.4501, - "train_samples_per_second": 931.955, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True" - ] - }, - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 96, - "eval_accuracy": 0.9775, - "train_runtime": 22.8292, - "train_samples_per_second": 7337.003, - "extra_arguments": [ - "--remove_unused_columns False", - "--image_column_name img", - "--seed 1337", - "--use_hpu_graphs_for_inference", - "--dataloader_num_workers 1", - "--pipelining_fwd_bwd True", - "--non_blocking_data_copy True", - "--throughput_warmup_steps 8" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json deleted file mode 100644 index 3927ec4a5b..0000000000 --- a/tests/baselines/wav2vec2_base.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "gaudi": { - "common_language": { - "num_train_epochs": 10, - "eval_batch_size": 64, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 32, - "eval_accuracy": 0.8013, - "train_runtime": 366.8081, - "train_samples_per_second": 716.385, - "eval_samples_per_second": 329.12, - "extra_arguments": [ - "--audio_column_name audio", - "--label_column_name language", - "--remove_unused_columns False", - "--max_length_seconds 8", - "--attention_mask False", - "--warmup_ratio 0.1", - "--seed 0", - "--dataloader_num_workers 1", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" - ] - } - } - } - }, - "gaudi2": { - "common_language": { - "num_train_epochs": 10, - "eval_batch_size": 64, - "distribution": { - "multi_card": { - "learning_rate": 5e-4, - "train_batch_size": 32, - "eval_accuracy": 0.8006, - "train_runtime": 109.2047, - "train_samples_per_second": 3048.207, - "eval_samples_per_second": 631.601, - "extra_arguments": [ - "--audio_column_name audio", - "--label_column_name language", - "--remove_unused_columns False", - "--max_length_seconds 8", - "--attention_mask False", - "--warmup_ratio 0.1", - "--seed 0", - "--dataloader_num_workers 1", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/baselines/wav2vec2_large_lv60.json deleted file mode 100644 index d645ced656..0000000000 --- a/tests/baselines/wav2vec2_large_lv60.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "gaudi": { - "regisss/librispeech_asr_for_optimum_habana_ci": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 6e-4, - "train_batch_size": 8, - "eval_wer": 0.0496, - "train_runtime": 984.3022, - "train_samples_per_second": 63.043, - "eval_samples_per_second": 54.189, - "extra_arguments": [ - "--dataset_config_name clean", - "--train_split_name train.100", - "--eval_split_name validation", - "--preprocessing_num_workers 64", - "--warmup_steps 500", - "--text_column_name text", - "--layerdrop 0.0", - "--freeze_feature_encoder", - "--dataloader_num_workers 8", - "--chars_to_ignore ',?.!-;:\"“%‘”'" - ] - } - } - } - }, - "gaudi2": { - "regisss/librispeech_asr_for_optimum_habana_ci": { - "num_train_epochs": 2, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 4e-4, - "train_batch_size": 8, - "eval_wer": 
0.06120587068623562, - "train_runtime": 308.8036, - "train_samples_per_second": 225.572, - "eval_samples_per_second": 196.665, - "extra_arguments": [ - "--dataset_config_name clean", - "--train_split_name train.100", - "--eval_split_name validation", - "--preprocessing_num_workers 1", - "--warmup_steps 500", - "--text_column_name text", - "--layerdrop 0.0", - "--freeze_feature_encoder", - "--dataloader_num_workers 8", - "--chars_to_ignore ',?.!-;:\"“%‘”'", - "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json deleted file mode 100644 index 6f3b01a412..0000000000 --- a/tests/baselines/whisper_small.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "gaudi": { - "mozilla-foundation/common_voice_11_0": { - "num_train_epochs": 10, - "eval_batch_size": 2, - "distribution": { - "multi_card": { - "learning_rate": 1e-4, - "train_batch_size": 8, - "eval_wer": 2.1133, - "train_runtime": 551.3249, - "train_samples_per_second": 145.59, - "eval_samples_per_second": 6.851, - "extra_arguments": [ - "--dataset_config_name hi", - "--language hindi", - "--task transcribe", - "--train_split_name train+validation", - "--eval_split_name test", - "--preprocessing_num_workers 1", - "--generation_max_length 225", - "--max_duration_in_seconds 30", - "--text_column_name sentence", - "--freeze_feature_encoder False", - "--dataloader_num_workers 8", - "--predict_with_generate", - "--use_hpu_graphs_for_inference", - "--label_features_max_length 128", - "--pipelining_fwd_bwd True" - ] - } - } - } - }, - "gaudi2": { - "mozilla-foundation/common_voice_11_0": { - "num_train_epochs": 10, - "eval_batch_size": 8, - "distribution": { - "multi_card": { - "learning_rate": 8e-5, - "train_batch_size": 32, - "eval_wer": 0.8477, - "train_runtime": 287.0947, - "train_samples_per_second": 307.526, - "eval_samples_per_second": 12.069, - "extra_arguments": [ - "--dataset_config_name hi", - "--language hindi", - "--task transcribe", - "--train_split_name train+validation", - "--eval_split_name test", - "--preprocessing_num_workers 1", - "--generation_max_length 225", - "--max_duration_in_seconds 30", - "--text_column_name sentence", - "--freeze_feature_encoder False", - "--dataloader_num_workers 8", - "--predict_with_generate", - "--use_hpu_graphs_for_inference", - "--label_features_max_length 128" - ] - } - } - } - } -} \ No newline at end of file diff --git a/tests/ci/albert_xxl_1x.sh b/tests/ci/albert_xxl_1x.sh deleted file mode 100644 index c2c9ef2918..0000000000 --- a/tests/ci/albert_xxl_1x.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -export RUN_ALBERT_XXL_1X=true -python -m pip install .[tests] -python -m pytest tests/test_examples.py -v -s -k "albert-xxlarge-v1_single_card" diff --git a/tests/ci/example_diff_tests.sh b/tests/ci/example_diff_tests.sh deleted file mode 100644 index b7920423bb..0000000000 --- a/tests/ci/example_diff_tests.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -make example_diff_tests diff --git a/tests/ci/fast_tests.sh b/tests/ci/fast_tests.sh deleted file mode 100644 index 7f1346384e..0000000000 --- a/tests/ci/fast_tests.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -make fast_tests diff --git a/tests/ci/fast_tests_diffusers.sh b/tests/ci/fast_tests_diffusers.sh deleted file mode 100644 index 9121bac16e..0000000000 --- 
a/tests/ci/fast_tests_diffusers.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -make fast_tests_diffusers diff --git a/tests/ci/sentence_transformers.sh b/tests/ci/sentence_transformers.sh deleted file mode 100644 index 838706874e..0000000000 --- a/tests/ci/sentence_transformers.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -python -m pip install /root/workspace/optimum-habana[tests] -cd /root/workspace/sentence-transformers/tests -python -m pip install .. -python -m pytest test_compute_embeddings.py test_evaluator.py test_multi_process.py test_pretrained_stsb.py test_util.py -cd /root/workspace/optimum-habana/tests -python -m pytest test_sentence_transformers.py diff --git a/tests/ci/slow_tests_1x.sh b/tests/ci/slow_tests_1x.sh deleted file mode 100644 index e5f23e93dc..0000000000 --- a/tests/ci/slow_tests_1x.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -make slow_tests_1x diff --git a/tests/ci/slow_tests_8x.sh b/tests/ci/slow_tests_8x.sh deleted file mode 100644 index cd2ace7580..0000000000 --- a/tests/ci/slow_tests_8x.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -make slow_tests_8x diff --git a/tests/ci/slow_tests_deepspeed.sh b/tests/ci/slow_tests_deepspeed.sh deleted file mode 100644 index ad94f8307f..0000000000 --- a/tests/ci/slow_tests_deepspeed.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -make slow_tests_deepspeed diff --git a/tests/ci/slow_tests_diffusers.sh b/tests/ci/slow_tests_diffusers.sh deleted file mode 100644 index ab776092a5..0000000000 --- a/tests/ci/slow_tests_diffusers.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -make test_installs -CUSTOM_BF16_OPS=1 python -m pytest tests/test_diffusers.py -v -s -k "test_no_throughput_regression_autocast" -make slow_tests_diffusers diff --git a/tests/ci/slow_tests_trl.sh b/tests/ci/slow_tests_trl.sh deleted file mode 100644 index 90a81ec892..0000000000 --- a/tests/ci/slow_tests_trl.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -python -m pip install --upgrade pip -export RUN_SLOW=true -make slow_tests_trl diff --git a/tests/clip_coco_utils.py b/tests/clip_coco_utils.py deleted file mode 100644 index effced891f..0000000000 --- a/tests/clip_coco_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -import os - -# Calculate CLIP score -from functools import partial -from pathlib import Path -from urllib.request import urlretrieve - -import torch -from torchmetrics.functional.multimodal import clip_score -from transformers import AutoImageProcessor, AutoTokenizer, VisionTextDualEncoderModel, VisionTextDualEncoderProcessor - - -COCO_URLS = [ - "http://images.cocodataset.org/zips/train2017.zip", - "http://images.cocodataset.org/zips/val2017.zip", - "http://images.cocodataset.org/zips/test2017.zip", - "http://images.cocodataset.org/annotations/annotations_trainval2017.zip", - "http://images.cocodataset.org/annotations/image_info_test2017.zip", -] - - -def download_files(list_of_urls, path=None): - if path is None: - path = os.getcwd() - - for url in list_of_urls: - print(f"Downloading {url}") - filename = url.split("/")[-1] - urlretrieve(url, Path(path, filename)) - print(f"{url} downloaded.") - - -def create_clip_roberta_model(): - print("Generating a CLIP-RoBERTa model...") - - model = 
VisionTextDualEncoderModel.from_vision_text_pretrained("openai/clip-vit-base-patch32", "roberta-base") - - tokenizer = AutoTokenizer.from_pretrained("roberta-base") - image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32") - processor = VisionTextDualEncoderProcessor(image_processor, tokenizer) - - # save the model and processor - model.save_pretrained("clip-roberta") - processor.save_pretrained("clip-roberta") - - print("Model generated.") - - -clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16") - - -def calculate_clip_score(images, prompts): - images_int = (images * 255).astype("uint8") - clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach() - return round(float(clip_score), 4) diff --git a/tests/configs/bf16_ops.txt b/tests/configs/bf16_ops.txt deleted file mode 100644 index e1ceba0591..0000000000 --- a/tests/configs/bf16_ops.txt +++ /dev/null @@ -1,14 +0,0 @@ -add -addmm -bmm -div -dropout -gelu -iadd -linear -layer_norm -matmul -mm -rsub -softmax -truediv diff --git a/tests/configs/deepspeed_zero_1.json b/tests/configs/deepspeed_zero_1.json deleted file mode 100644 index a3b8cafd4a..0000000000 --- a/tests/configs/deepspeed_zero_1.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "steps_per_print": 1, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 1 - } -} diff --git a/tests/configs/deepspeed_zero_2.json b/tests/configs/deepspeed_zero_2.json deleted file mode 100644 index 2b3634c472..0000000000 --- a/tests/configs/deepspeed_zero_2.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "steps_per_print": 1, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} \ No newline at end of file diff --git a/tests/configs/deepspeed_zero_3_gaudi1.json b/tests/configs/deepspeed_zero_3_gaudi1.json deleted file mode 100644 index 9236ec1bce..0000000000 --- a/tests/configs/deepspeed_zero_3_gaudi1.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "bf16": { - "enabled": true - }, - "optimizer": { - "type": "adam", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto", - "torch_adam": "torch_impl", - "adam_w_mode": true - } - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e5, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e5, - "stage3_max_reuse_distance": 1e5, - "stage3_gather_16bit_weights_on_model_save": true - }, - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} diff --git a/tests/configs/fp32_ops.txt b/tests/configs/fp32_ops.txt deleted file mode 100644 index f1edc6075c..0000000000 --- a/tests/configs/fp32_ops.txt +++ /dev/null @@ -1,3 +0,0 @@ -embedding -nll_loss -log_softmax diff --git 
a/tests/configs/gaudi_config_trainer_test.json b/tests/configs/gaudi_config_trainer_test.json deleted file mode 100644 index c9fbc2a8c3..0000000000 --- a/tests/configs/gaudi_config_trainer_test.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "use_fused_adam": true, - "use_fused_clip_norm": true -} \ No newline at end of file diff --git a/tests/create_diff_file_for_example.py b/tests/create_diff_file_for_example.py deleted file mode 100644 index 2a169573f7..0000000000 --- a/tests/create_diff_file_for_example.py +++ /dev/null @@ -1,171 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tool to create or update a diff file between transformers and optimum examples.""" - -import re -import subprocess -import tempfile -from argparse import ArgumentParser -from pathlib import Path - -from git import Repo - - -DIFF_DIRECTORY = Path(__file__).parent.resolve() / "example_diff" - - -def _ask_yes_or_no_question(message: str) -> str: - if message[-1] == "?": - message = message[:-1] - message = f"{message} (y/n) ? " - continue_ = True - while continue_: - res = input(message) - if res not in ["y", "n"]: - print(f"You must answer by either y (yes) or n (no), but {res} was provided.\n") - else: - continue_ = False - return res - - -def diff(filename1: Path, filename2: Path) -> str: - if not filename1.exists() or not filename2.exists(): - raise FileNotFoundError( - f"Cannot compute the diff because at least one of the files does not exist: {filename1} and/or {filename2}." 
- ) - cmd_line = ["diff", str(filename1), str(filename2)] - p = subprocess.Popen(cmd_line, stdout=subprocess.PIPE) - outs, _ = p.communicate() - return outs.decode("utf-8") - - -def _colorize_lines(content): - lines = content.split("\n") - color_mapping = { - "<": "\033[0;31m", # Red - ">": "\033[0;32m", # Green - "-": "", - "default": "\033[0;36m", # Blue - } - end_color = "\033[0;0m" - for i, line in enumerate(lines): - if not line: - continue - start_char = color_mapping.get(line[0], color_mapping["default"]) - lines[i] = "".join([start_char, line, end_color]) - return "\n".join(lines) - - -def create_diff_content(raw_diff: str, keep_all_diffs: bool = False) -> str: - matches = list(re.finditer(r"^[^><-]+", raw_diff, flags=re.MULTILINE)) - final_diff = [] - for m1, m2 in zip(matches, matches[1:] + [None]): - start, end = m1.span()[0], m2.span()[0] if m2 is not None else None - if end is not None and raw_diff[end - 1] == "\n": - end = end - 1 - content = raw_diff[start:end] - if not keep_all_diffs: - print(_colorize_lines(content)) - keep_diff = _ask_yes_or_no_question("Keep this diff") - if keep_diff == "n": - continue - final_diff.append(content) - return "\n".join(final_diff) - - -def auto_diff(): - with tempfile.TemporaryDirectory() as tmpdirname: - # Clone the Transformers GH repo - Repo.clone_from("https://github.com/huggingface/transformers.git", tmpdirname) - - # Get paths to Optimum and Transformers examples - path_to_optimum_examples = Path(__file__).resolve().parent / "../examples/" - optimum_example_dirs = [directory for directory in path_to_optimum_examples.iterdir() if directory.is_dir()] - path_to_transformers_examples = Path(f"{tmpdirname}/examples/pytorch/") - transformers_example_dirs = [ - directory for directory in path_to_transformers_examples.iterdir() if directory.is_dir() - ] - - # Loop over Optimum examples to compare them with their Transformers counterpart - for directory in optimum_example_dirs: - # Check if the example is in Transformers - if directory.name in [folder.name for folder in transformers_example_dirs]: - path_to_transformers = path_to_transformers_examples / directory.name - # Loop over all the "run_*.py" scripts in the example folder - for file in directory.iterdir(): - if file.is_file() and file.name.startswith("run_"): - transformers_file = path_to_transformers / file.name - if transformers_file.is_file(): - final_diff = create_diff_content( - diff( - transformers_file, - file, - ), - keep_all_diffs=True, - ) - diff_filename = DIFF_DIRECTORY / f"{file.stem}.txt" - with open(diff_filename, "w") as fp: - fp.write(final_diff) - - -def parse_args(): - parser = ArgumentParser( - description="Tool to create or update a diff file between transformers and optimum examples." - ) - parser.add_argument("--transformers", type=Path, help="The path to the transformers example") - parser.add_argument("--optimum", type=Path, help="The path to the optimum example") - parser.add_argument( - "--auto", - action="store_true", - help="Whether to automatically write diff files or not. 
If true, all diffs will be accepted.", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - if args.auto: - auto_diff() - else: - if args.transformers is None and args.optimum is None: - raise ValueError("`--transformers` and `--optimum` must be both set if `--auto` is not set.") - raw_diff = diff(args.transformers, args.optimum) - print(f"Creating the diff file between {args.transformers} and {args.optimum}:\n") - final_diff = create_diff_content(raw_diff) - print(f"Difference between {args.transformers} and {args.optimum}:\n") - print(_colorize_lines(final_diff)) - print("\n") - - default_filename = DIFF_DIRECTORY / f"{args.transformers.stem}.txt" - filename = input(f"Would you like to save this file at {default_filename} (y/n/other path)? ") - if filename == "y": - filename = default_filename - if filename != "n": - filename = Path(filename) - should_override = True - if filename.exists(): - should_override = _ask_yes_or_no_question("This file already exists, do you want to overwrite it") - should_override = should_override == "y" - - if should_override: - with open(filename, "w") as fp: - fp.write(final_diff) - - print(f"Content saved at: {filename}") - - -if __name__ == "__main__": - main() diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt deleted file mode 100644 index 278d3485ff..0000000000 --- a/tests/example_diff/run_audio_classification.txt +++ /dev/null @@ -1,116 +0,0 @@ -20d19 -< import warnings -28,29d26 -< from datasets import DatasetDict, load_dataset -< -31,39c28,29 -< from transformers import ( -< AutoConfig, -< AutoFeatureExtractor, -< AutoModelForAudioClassification, -< HfArgumentParser, -< Trainer, -< TrainingArguments, -< set_seed, -< ) ---- -> from datasets import DatasetDict, load_dataset -> from transformers import AutoConfig, AutoFeatureExtractor, AutoModelForAudioClassification, HfArgumentParser -43a34,44 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -> -> -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -47,48c48,50 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") ---- -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -174,176d175 -< freeze_feature_extractor: Optional[bool] = field( -< default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."} -< ) -182,196d180 -< def __post_init__(self): -< if not self.freeze_feature_extractor and self.freeze_feature_encoder: -< warnings.warn( -< "The argument `--freeze_feature_extractor` is deprecated and " -< "will be removed in a future version. Use `--freeze_feature_encoder` " -< "instead. Setting `freeze_feature_encoder==True`.", -< FutureWarning, -< ) -< if self.freeze_feature_extractor and not self.freeze_feature_encoder: -< raise ValueError( -< "The argument `--freeze_feature_extractor` is deprecated and " -< "should not be used in combination with `--freeze_feature_encoder`. " -< "Only make use of `--freeze_feature_encoder`." 
-< ) -< -203c187 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -231a216,222 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -232a224 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -234,235c226,228 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -302a296,298 -> # Max input length -> max_length = int(round(feature_extractor.sampling_rate * data_args.max_length_seconds)) -> -307a304 -> -313c310,316 -< inputs = feature_extractor(subsampled_wavs, sampling_rate=feature_extractor.sampling_rate) ---- -> inputs = feature_extractor( -> subsampled_wavs, -> max_length=max_length, -> sampling_rate=feature_extractor.sampling_rate, -> padding="max_length", -> truncation=True, -> ) -322c325,331 -< inputs = feature_extractor(wavs, sampling_rate=feature_extractor.sampling_rate) ---- -> inputs = feature_extractor( -> wavs, -> max_length=max_length, -> sampling_rate=feature_extractor.sampling_rate, -> padding="max_length", -> truncation=True, -> ) -368,369c377,378 -< # freeze the convolutional waveform encoder -< if model_args.freeze_feature_encoder: ---- -> # freeze the convolutional waveform encoder if supported by model -> if hasattr(model, "freeze_feature_encoder") and model_args.freeze_feature_encoder: -389c398 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -390a400 -> gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt deleted file mode 100644 index 2eebcc2d7b..0000000000 --- a/tests/example_diff/run_clip.txt +++ /dev/null @@ -1,102 +0,0 @@ -18d17 -< -32a32 -> import transformers -33a34 -> from habana_dataloader_trainer import HabanaDataloaderTrainer -38,39d38 -< -< import transformers -45,47d43 -< Trainer, -< TrainingArguments, -< set_seed, -52a49,59 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -> -> -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -56,57c63,65 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") ---- -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -181a190,192 -> mediapipe_dataloader: bool = field( -> default=False, metadata={"help": "Turn on MediaPipe hardware-based accelerated data loading."} -> ) -240c251 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -268a280,286 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -269a288 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -271,272c290,292 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -419d438 -< image_transformations = torch.jit.script(image_transformations) -466,467c485,493 -< # Transform images on the fly as doing it on the whole dataset takes too much time. -< train_dataset.set_transform(transform_images) ---- -> if data_args.mediapipe_dataloader: -> train_dataset.image_mean = image_processor.image_mean -> train_dataset.image_std = image_processor.image_std -> train_dataset.text_max_length = data_args.max_seq_length -> train_dataset.image_resize = config.vision_config.image_size -> train_dataset.transform_func = transform_images -> else: -> # Transform images on the fly as doing it on the whole dataset takes too much time. -> train_dataset.set_transform(transform_images) -489,490c515,523 -< # Transform images on the fly as doing it on the whole dataset takes too much time. -< eval_dataset.set_transform(transform_images) ---- -> if data_args.mediapipe_dataloader: -> eval_dataset.image_mean = image_processor.image_mean -> eval_dataset.image_std = image_processor.image_std -> eval_dataset.text_max_length = data_args.max_seq_length -> eval_dataset.image_resize = config.vision_config.image_size -> eval_dataset.transform_func = transform_images -> else: -> # Transform images on the fly as doing it on the whole dataset takes too much time. -> eval_dataset.set_transform(transform_images) -513a547,555 -> if data_args.mediapipe_dataloader: -> test_dataset.image_mean = image_processor.image_mean -> test_dataset.image_std = image_processor.image_std -> test_dataset.text_max_length = data_args.max_seq_length -> test_dataset.image_resize = config.vision_config.image_size -> test_dataset.transform_func = transform_images -> else: -> # Transform images on the fly as doing it on the whole dataset takes too much time. -> test_dataset.set_transform(transform_images) -516c558,559 -< trainer = Trainer( ---- -> trainer_cls = HabanaDataloaderTrainer if data_args.mediapipe_dataloader else GaudiTrainer -> trainer = trainer_cls( -517a561 -> gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt deleted file mode 100644 index 7db8099ecf..0000000000 --- a/tests/example_diff/run_clm.txt +++ /dev/null @@ -1,156 +0,0 @@ -3c3 -< # Copyright 2020 The HuggingFace Inc. team. All rights reserved. ---- -> # Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
-17,19c17,18 -< Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. -< -< Here is the full list of checkpoints on the hub that can be fine-tuned by this script: ---- -> Training the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. -> Here is the full list of checkpoints on the hub that can be trained by this script: -35,36d33 -< from datasets import load_dataset -< -37a35 -> from datasets import load_dataset -45,46d42 -< Trainer, -< TrainingArguments, -48,49d43 -< is_torch_xla_available, -< set_seed, -55a50,51 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -57,58d52 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -60c54,60 -< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -63a64,69 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") -> -79c85,86 -< "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." ---- -> "The model checkpoint for weights initialization. Don't set it if you want to train a model from" -> " scratch." -142a150,158 -> use_cache: bool = field( -> default=True, -> metadata={ -> "help": ( -> "Whether or not the model should return the last key/values attentions (not used by all models)." -> "Only relevant if `config.is_decoder=True`." -> ) -> }, -> ) -148c164 -< "set True will benefit LLM loading time and RAM consumption." ---- -> "Setting it to True will benefit LLM loading time and RAM consumption." 
-195c211,212 -< streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) ---- -> -> streaming: bool = field(default=False, metadata={"help": "Enable streaming mode."}) -221a239,241 -> save_last_ckpt: bool = field( -> default=True, metadata={"help": "Whether to save checkpoint at the end of the training."} -> ) -243c263 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -272a293,299 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -273a301 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -275,276c303,305 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -387a417 -> "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, -483a514 -> -547a579,582 -> -> def tensor_mapper(x): -> return {i: torch.tensor(x[i], dtype=torch.int32) for i in x} -> -550a586,587 -> if training_args.resume_from_checkpoint is not None and training_args.resume_from_checkpoint != "": -> train_dataset = train_dataset.map(tensor_mapper) -581c618 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -582a620 -> gaudi_config=gaudi_config, -589,592c627,628 -< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, -< preprocess_logits_for_metrics=preprocess_logits_for_metrics -< if training_args.do_eval and not is_torch_xla_available() -< else None, ---- -> compute_metrics=compute_metrics if training_args.do_eval else None, -> preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -603c639,640 -< trainer.save_model() # Saves the tokenizer too for easy upload ---- -> if data_args.save_last_ckpt: -> trainer.save_model() # Saves the tokenizer too for easy upload -607,610c644,650 -< max_train_samples = ( -< data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) -< ) -< metrics["train_samples"] = min(max_train_samples, len(train_dataset)) ---- -> if data_args.streaming: -> metrics["train_samples"] = training_args.max_steps * training_args.per_device_train_batch_size -> else: -> max_train_samples = ( -> data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) -> ) -> metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -619d658 -< -622,623c661,666 -< max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) -< metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) ---- -> if not data_args.streaming: -> max_eval_samples = ( -> data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) -> ) -> metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) -> -646,650d688 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_generation.txt 
b/tests/example_diff/run_generation.txt deleted file mode 100644 index 5da903f6e8..0000000000 --- a/tests/example_diff/run_generation.txt +++ /dev/null @@ -1,1035 +0,0 @@ -17c17,19 -< """Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)""" ---- -> """ -> Conditional text generation on Habana Gaudi/Gaudi2. -> """ -20c22 -< import inspect ---- -> import json -22c24,28 -< from typing import Tuple ---- -> import math -> import os -> import time -> from itertools import cycle -> from pathlib import Path -25,26c31 -< from accelerate import PartialState -< from accelerate.utils import set_seed ---- -> from utils import adjust_batch, count_hpu_graphs, initialize_model -28,50c33 -< from transformers import ( -< AutoTokenizer, -< BloomForCausalLM, -< BloomTokenizerFast, -< CTRLLMHeadModel, -< CTRLTokenizer, -< GenerationMixin, -< GPT2LMHeadModel, -< GPT2Tokenizer, -< GPTJForCausalLM, -< LlamaForCausalLM, -< LlamaTokenizer, -< OpenAIGPTLMHeadModel, -< OpenAIGPTTokenizer, -< OPTForCausalLM, -< TransfoXLLMHeadModel, -< TransfoXLTokenizer, -< XLMTokenizer, -< XLMWithLMHeadModel, -< XLNetLMHeadModel, -< XLNetTokenizer, -< ) -< from transformers.modeling_outputs import CausalLMOutputWithPast ---- -> from optimum.habana.utils import get_hpu_memory_stats -60,282d42 -< MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop -< -< MODEL_CLASSES = { -< "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), -< "ctrl": (CTRLLMHeadModel, CTRLTokenizer), -< "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), -< "xlnet": (XLNetLMHeadModel, XLNetTokenizer), -< "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer), -< "xlm": (XLMWithLMHeadModel, XLMTokenizer), -< "gptj": (GPTJForCausalLM, AutoTokenizer), -< "bloom": (BloomForCausalLM, BloomTokenizerFast), -< "llama": (LlamaForCausalLM, LlamaTokenizer), -< "opt": (OPTForCausalLM, GPT2Tokenizer), -< } -< -< # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia -< # in https://github.com/rusiaaman/XLNet-gen#methodology -< # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e -< PREFIX = """In 1991, the remains of Russian Tsar Nicholas II and his family -< (except for Alexei and Maria) are discovered. -< The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the -< remainder of the story. 1883 Western Siberia, -< a young Grigori Rasputin is asked by his father and a group of men to perform magic. -< Rasputin has a vision and denounces one of the men as a horse thief. Although his -< father initially slaps him for making such an accusation, Rasputin watches as the -< man is chased outside and beaten. Twenty years later, Rasputin sees a vision of -< the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, -< with people, even a bishop, begging for his blessing. """ -< -< -< # -< # Functions to prepare models' input -< # -< -< -< def prepare_ctrl_input(args, _, tokenizer, prompt_text): -< if args.temperature > 0.7: -< logger.info("CTRL typically works better with lower temperatures (and lower top_k).") -< -< encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) -< if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): -< logger.info("WARNING! 
You are not starting your generation from a control code so you won't get good results") -< return prompt_text -< -< -< def prepare_xlm_input(args, model, tokenizer, prompt_text): -< # kwargs = {"language": None, "mask_token_id": None} -< -< # Set the language -< use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb -< if hasattr(model.config, "lang2id") and use_lang_emb: -< available_languages = model.config.lang2id.keys() -< if args.xlm_language in available_languages: -< language = args.xlm_language -< else: -< language = None -< while language not in available_languages: -< language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ") -< -< model.config.lang_id = model.config.lang2id[language] -< # kwargs["language"] = tokenizer.lang2id[language] -< -< # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers -< # XLM masked-language modeling (MLM) models need masked token -< # is_xlm_mlm = "mlm" in args.model_name_or_path -< # if is_xlm_mlm: -< # kwargs["mask_token_id"] = tokenizer.mask_token_id -< -< return prompt_text -< -< -< def prepare_xlnet_input(args, _, tokenizer, prompt_text): -< prefix = args.prefix if args.prefix else args.padding_text if args.padding_text else PREFIX -< prompt_text = prefix + prompt_text -< return prompt_text -< -< -< def prepare_transfoxl_input(args, _, tokenizer, prompt_text): -< prefix = args.prefix if args.prefix else args.padding_text if args.padding_text else PREFIX -< prompt_text = prefix + prompt_text -< return prompt_text -< -< -< PREPROCESSING_FUNCTIONS = { -< "ctrl": prepare_ctrl_input, -< "xlm": prepare_xlm_input, -< "xlnet": prepare_xlnet_input, -< "transfo-xl": prepare_transfoxl_input, -< } -< -< -< def adjust_length_to_model(length, max_sequence_length): -< if length < 0 and max_sequence_length > 0: -< length = max_sequence_length -< elif 0 < max_sequence_length < length: -< length = max_sequence_length # No generation bigger than model size -< elif length < 0: -< length = MAX_LENGTH # avoid infinite loop -< return length -< -< -< def sparse_model_config(model_config): -< embedding_size = None -< if hasattr(model_config, "hidden_size"): -< embedding_size = model_config.hidden_size -< elif hasattr(model_config, "n_embed"): -< embedding_size = model_config.n_embed -< elif hasattr(model_config, "n_embd"): -< embedding_size = model_config.n_embd -< -< num_head = None -< if hasattr(model_config, "num_attention_heads"): -< num_head = model_config.num_attention_heads -< elif hasattr(model_config, "n_head"): -< num_head = model_config.n_head -< -< if embedding_size is None or num_head is None or num_head == 0: -< raise ValueError("Check the model config") -< -< num_embedding_size_per_head = int(embedding_size / num_head) -< if hasattr(model_config, "n_layer"): -< num_layer = model_config.n_layer -< elif hasattr(model_config, "num_hidden_layers"): -< num_layer = model_config.num_hidden_layers -< else: -< raise ValueError("Number of hidden layers couldn't be determined from the model config") -< -< return num_layer, num_head, num_embedding_size_per_head -< -< -< def generate_past_key_values(model, batch_size, seq_len): -< num_block_layers, num_attention_heads, num_embedding_size_per_head = sparse_model_config(model.config) -< if model.config.model_type == "bloom": -< past_key_values = tuple( -< ( -< torch.empty(int(num_attention_heads * batch_size), num_embedding_size_per_head, seq_len) -< .to(model.dtype) -< .to(model.device), -< 
torch.empty(int(num_attention_heads * batch_size), seq_len, num_embedding_size_per_head) -< .to(model.dtype) -< .to(model.device), -< ) -< for _ in range(num_block_layers) -< ) -< else: -< past_key_values = tuple( -< ( -< torch.empty(batch_size, num_attention_heads, seq_len, num_embedding_size_per_head) -< .to(model.dtype) -< .to(model.device), -< torch.empty(batch_size, num_attention_heads, seq_len, num_embedding_size_per_head) -< .to(model.dtype) -< .to(model.device), -< ) -< for _ in range(num_block_layers) -< ) -< return past_key_values -< -< -< def prepare_jit_inputs(inputs, model, tokenizer): -< batch_size = len(inputs) -< dummy_input = tokenizer.batch_encode_plus(inputs, return_tensors="pt") -< dummy_input = dummy_input.to(model.device) -< if model.config.use_cache: -< dummy_input["past_key_values"] = generate_past_key_values(model, batch_size, 1) -< dummy_input["attention_mask"] = torch.cat( -< [ -< torch.zeros(dummy_input["attention_mask"].shape[0], 1) -< .to(dummy_input["attention_mask"].dtype) -< .to(model.device), -< dummy_input["attention_mask"], -< ], -< -1, -< ) -< return dummy_input -< -< -< class _ModelFallbackWrapper(GenerationMixin): -< __slots__ = ("_optimized", "_default") -< -< def __init__(self, optimized, default): -< self._optimized = optimized -< self._default = default -< -< def __call__(self, *args, **kwargs): -< if kwargs["past_key_values"] is None and self._default.config.use_cache: -< kwargs["past_key_values"] = generate_past_key_values(self._default, kwargs["input_ids"].shape[0], 0) -< kwargs.pop("position_ids", None) -< for k in list(kwargs.keys()): -< if kwargs[k] is None or isinstance(kwargs[k], bool): -< kwargs.pop(k) -< outputs = self._optimized(**kwargs) -< lm_logits = outputs[0] -< past_key_values = outputs[1] -< fixed_output = CausalLMOutputWithPast( -< loss=None, -< logits=lm_logits, -< past_key_values=past_key_values, -< hidden_states=None, -< attentions=None, -< ) -< return fixed_output -< -< def __getattr__(self, item): -< return getattr(self._default, item) -< -< def prepare_inputs_for_generation( -< self, input_ids, past_key_values=None, inputs_embeds=None, use_cache=None, **kwargs -< ): -< return self._default.prepare_inputs_for_generation( -< input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, **kwargs -< ) -< -< def _reorder_cache( -< self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor -< ) -> Tuple[Tuple[torch.Tensor]]: -< """ -< This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or -< [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct -< beam_idx at every generation step. 
-< """ -< return self._default._reorder_cache(past_key_values, beam_idx) -< -284,285c44,46 -< def main(): -< parser = argparse.ArgumentParser() ---- -> def setup_parser(parser): -> # Arguments management -> parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") -287c48 -< "--model_type", ---- -> "--model_name_or_path", -291c52 -< help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ---- -> help="Path to pre-trained model (on the HF Hub or locally).", -294c55,83 -< "--model_name_or_path", ---- -> "--bf16", -> action="store_true", -> help="Whether to perform generation in bf16 precision.", -> ) -> parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") -> parser.add_argument( -> "--max_input_tokens", -> type=int, -> default=0, -> help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ -> if == 0, then truncate to 16 (original default) \ -> if < 0, then do not truncate, use full input prompt", -> ) -> parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") -> parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") -> parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") -> parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") -> parser.add_argument( -> "--use_kv_cache", -> action="store_true", -> help="Whether to use the key/value cache for decoding. It should speed up generation.", -> ) -> parser.add_argument( -> "--use_hpu_graphs", -> action="store_true", -> help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", -> ) -> parser.add_argument( -> "--dataset_name", -297,298c86,92 -< required=True, -< help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ---- -> help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", -> ) -> parser.add_argument( -> "--column_name", -> default=None, -> type=str, -> help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", -300,304d93 -< -< parser.add_argument("--prompt", type=str, default="") -< parser.add_argument("--length", type=int, default=20) -< parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") -< -306,309c95,97 -< "--temperature", -< type=float, -< default=1.0, -< help="temperature of 1.0 has no effect, lower tend toward greedy sampling", ---- -> "--do_sample", -> action="store_true", -> help="Whether to use sampling for generation.", -312c100,233 -< "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" ---- -> "--num_beams", -> default=1, -> type=int, -> help="Number of beams used for beam search generation. 1 means greedy search will be performed.", -> ) -> parser.add_argument( -> "--trim_logits", -> action="store_true", -> help="Calculate logits only for the last token to save memory in the first step.", -> ) -> parser.add_argument( -> "--seed", -> default=27, -> type=int, -> help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", -> ) -> parser.add_argument( -> "--profiling_warmup_steps", -> default=0, -> type=int, -> help="Number of steps to ignore for profiling.", -> ) -> parser.add_argument( -> "--profiling_steps", -> default=0, -> type=int, -> help="Number of steps to capture for profiling.", -> ) -> parser.add_argument( -> "--profiling_record_shapes", -> default=False, -> type=bool, -> help="Record shapes when enabling profiling.", -> ) -> parser.add_argument( -> "--prompt", -> default=None, -> type=str, -> nargs="*", -> help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', -> ) -> parser.add_argument( -> "--bad_words", -> default=None, -> type=str, -> nargs="+", -> help="Optional argument list of words that are not allowed to be generated.", -> ) -> parser.add_argument( -> "--force_words", -> default=None, -> type=str, -> nargs="+", -> help="Optional argument list of words that must be generated.", -> ) -> parser.add_argument( -> "--assistant_model", -> default=None, -> type=str, -> help="Optional argument to give a path to a draft/assistant model for assisted decoding.", -> ) -> parser.add_argument( -> "--peft_model", -> default=None, -> type=str, -> help="Optional argument to give a path to a PEFT model.", -> ) -> parser.add_argument("--num_return_sequences", type=int, default=1) -> parser.add_argument( -> "--token", -> default=None, -> type=str, -> help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " -> "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", -> ) -> parser.add_argument( -> "--model_revision", -> default="main", -> type=str, -> help="The specific model version to use (can be a branch name, tag name or commit id).", -> ) -> parser.add_argument( -> "--attn_softmax_bf16", -> action="store_true", -> help="Whether to run attention softmax layer in lower precision provided that the model supports it and " -> "is also running in lower precision.", -> ) -> parser.add_argument( -> "--output_dir", -> default=None, -> type=str, -> help="Output directory to store results in.", -> ) -> parser.add_argument( -> "--bucket_size", -> default=-1, -> type=int, -> help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ -> then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ -> we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", -> ) -> parser.add_argument( -> "--bucket_internal", -> action="store_true", -> help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", -> ) -> parser.add_argument( -> "--dataset_max_samples", -> default=-1, -> type=int, -> help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", -> ) -> parser.add_argument( -> "--limit_hpu_graphs", -> action="store_true", -> help="Skip HPU Graph usage for first token to save memory", -> ) -> parser.add_argument( -> "--reuse_cache", -> action="store_true", -> help="Whether to reuse key/value cache for decoding. 
It should save memory.", -> ) -> parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") -> parser.add_argument( -> "--simulate_dyn_prompt", -> default=None, -> type=int, -> nargs="*", -> help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", -> ) -> parser.add_argument( -> "--reduce_recompile", -> action="store_true", -> help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", -314,319d234 -< parser.add_argument("--k", type=int, default=0) -< parser.add_argument("--p", type=float, default=0.9) -< -< parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.") -< parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.") -< parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") -321d235 -< parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") -323c237,252 -< "--use_cpu", ---- -> "--use_flash_attention", -> action="store_true", -> help="Whether to enable Habana Flash Attention, provided that the model supports it.", -> ) -> parser.add_argument( -> "--flash_attention_recompute", -> action="store_true", -> help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", -> ) -> parser.add_argument( -> "--flash_attention_causal_mask", -> action="store_true", -> help="Whether to enable Habana Flash Attention in causal mode on first token generation.", -> ) -> parser.add_argument( -> "--flash_attention_fast_softmax", -325c254 -< help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available", ---- -> help="Whether to enable Habana Flash Attention in fast softmax mode.", -327d255 -< parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") -329c257 -< "--fp16", ---- -> "--book_source", -331c259,288 -< help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ---- -> help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", -> ) -> parser.add_argument( -> "--torch_compile", -> action="store_true", -> help="Whether to use torch compiled model or not.", -> ) -> parser.add_argument( -> "--ignore_eos", -> default=True, -> action=argparse.BooleanOptionalAction, -> help="Whether to ignore eos, set False to disable it", -> ) -> parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") -> parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") -> parser.add_argument( -> "--const_serialization_path", -> "--csp", -> type=str, -> help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", -> ) -> parser.add_argument( -> "--disk_offload", -> action="store_true", -> help="Whether to enable device map auto. 
In case no space left on cpu, weights will be offloaded to disk.", -> ) -> parser.add_argument( -> "--trust_remote_code", -> action="store_true", -> help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", -333d289 -< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") -336,339c292,293 -< # Initialize the distributed state. -< distributed_state = PartialState(cpu=args.use_cpu) -< -< logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}") ---- -> if args.torch_compile: -> args.use_hpu_graphs = False -341,342c295,296 -< if args.seed is not None: -< set_seed(args.seed) ---- -> if not args.use_hpu_graphs: -> args.limit_hpu_graphs = False -344,371c298,303 -< # Initialize the model and tokenizer -< try: -< args.model_type = args.model_type.lower() -< model_class, tokenizer_class = MODEL_CLASSES[args.model_type] -< except KeyError: -< raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)") -< -< tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) -< if tokenizer.pad_token is None: -< tokenizer.pad_token = tokenizer.eos_token -< model = model_class.from_pretrained(args.model_name_or_path) -< -< # Set the model to the right device -< model.to(distributed_state.device) -< -< if args.fp16: -< model.half() -< max_seq_length = getattr(model.config, "max_position_embeddings", 0) -< args.length = adjust_length_to_model(args.length, max_sequence_length=max_seq_length) -< logger.info(args) -< -< prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") -< -< # Different models need different input formatting and/or extra arguments -< requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys() -< if requires_preprocessing: -< prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) -< preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text) ---- -> args.quant_config = os.getenv("QUANT_CONFIG", "") -> if args.quant_config == "" and args.disk_offload: -> logger.warning( -> "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
-> ) -> return args -373,376d304 -< if model.__class__.__name__ in ["TransfoXLLMHeadModel"]: -< tokenizer_kwargs = {"add_space_before_punct_symbol": True} -< else: -< tokenizer_kwargs = {} -378,384c306,309 -< encoded_prompt = tokenizer.encode( -< preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs -< ) -< else: -< prefix = args.prefix if args.prefix else args.padding_text -< encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt") -< encoded_prompt = encoded_prompt.to(distributed_state.device) ---- -> def main(): -> parser = argparse.ArgumentParser() -> args = setup_parser(parser) -> model, assistant_model, tokenizer, generation_config = initialize_model(args, logger) -386,387c311,506 -< if encoded_prompt.size()[-1] == 0: -< input_ids = None ---- -> use_lazy_mode = True -> if args.torch_compile and model.config.model_type == "llama": -> use_lazy_mode = False -> -> import habana_frameworks.torch.hpu as torch_hpu -> -> if args.dataset_name is None: -> # Benchmark over the prompts below -> if args.prompt: -> input_sentences = args.prompt -> elif args.book_source: -> -> def download_book(book_id): -> import os -> -> import requests -> -> url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt" -> response = requests.get(url) -> if response.status_code == 200: -> pid = os.getpid() -> save_path = f"/tmp/{book_id}_{pid}.txt" -> with open(save_path, "wb") as file: -> file.write(response.content) -> print(f"Book downloaded and saved to: {save_path}") -> return save_path -> else: -> print("Failed to download book! Exiting...") -> import sys -> -> sys.exit() -> -> def assemble_prompt(prompt_size, book_path): -> prompt = "" -> counter = 0 -> book_lines = open(book_path).readlines() -> for line in book_lines: -> for word in line.split(): -> counter += 1 -> prompt += word + " " -> if counter == prompt_size: -> return [prompt] * args.batch_size -> -> book_ids = [ -> 2701, # Moby Dick; Or, The Whale -> 1513, # Romeo and Juliet -> 1342, # Pride and Prejudice -> ] -> input_sentences = assemble_prompt(prompt_size=args.max_input_tokens, book_path=download_book(book_ids[0])) -> else: -> input_sentences = [ -> "DeepSpeed is a machine learning framework", -> "He is working on", -> "He has a", -> "He got all", -> "Everyone is happy and I can", -> "The new movie that got Oscar this year", -> "In the far far distance from our galaxy,", -> "Peace is the only way", -> ] -> -> if args.batch_size > len(input_sentences): -> # Dynamically extends to support larger batch sizes -> num_sentences_to_add = args.batch_size - len(input_sentences) -> for i in range(num_sentences_to_add): -> input_sentences.append(input_sentences[i % len(input_sentences)]) -> elif args.batch_size < len(input_sentences): -> input_sentences = input_sentences[: args.batch_size] -> -> def generate(size=None, reduce_recompile=False): -> """Generates sequences from the input sentences and returns them.""" -> encode_t0 = time.perf_counter() -> # Tokenization -> if args.max_input_tokens > 0: -> input_tokens = tokenizer.batch_encode_plus( -> input_sentences, -> return_tensors="pt", -> padding="max_length", -> max_length=args.max_input_tokens, -> truncation=True, -> ) -> else: -> input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True) -> encode_duration = time.perf_counter() - encode_t0 -> -> if size is not None: -> input_tokens = adjust_batch(input_tokens, size) -> if not reduce_recompile: -> # Move inputs to target 
device(s) -> for t in input_tokens: -> if torch.is_tensor(input_tokens[t]): -> input_tokens[t] = input_tokens[t].to(args.device) -> iteration_times = [] -> outputs = model.generate( -> **input_tokens, -> generation_config=generation_config, -> assistant_model=assistant_model, -> lazy_mode=use_lazy_mode, -> hpu_graphs=args.use_hpu_graphs, -> profiling_steps=args.profiling_steps, -> profiling_warmup_steps=args.profiling_warmup_steps, -> ignore_eos=args.ignore_eos, -> iteration_times=iteration_times, -> profiling_record_shapes=args.profiling_record_shapes, -> ).cpu() -> first_token_time = iteration_times[0] + encode_duration -> logger.info(f"Time to first token = {first_token_time*1000}ms") -> return tokenizer.batch_decode(outputs, skip_special_tokens=True) -> -> from optimum.habana.utils import HabanaProfile -> -> # compilation stage disable profiling -> HabanaProfile.disable() -> # Compilation -> logger.info("Graph compilation...") -> dyn_prompt_lens = args.simulate_dyn_prompt -> t0 = time.perf_counter() -> # The first three iterations take longer because of graph compilation -> if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1: -> for _ in range(args.warmup): -> if dyn_prompt_lens is None: -> print("Warming up", flush=True) -> generate(None, args.reduce_recompile) -> else: -> print("Warming up for shape,", dyn_prompt_lens[0], flush=True) -> generate(dyn_prompt_lens[0], args.reduce_recompile) -> else: -> if args.bucket_size > 0: -> mn = min(dyn_prompt_lens) -> mx = max(dyn_prompt_lens) -> -> def rounder(x): -> return int(math.ceil(x / args.bucket_size) * args.bucket_size) -> -> min_prompt_len = rounder(mn) -> max_sentence_len = rounder(mx) -> for _ in range(args.warmup): -> lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) -> for sz in lst: -> print("Warming up for shape,", sz - 1, flush=True) -> generate(sz - 1, args.reduce_recompile) -> torch_hpu.synchronize() -> compilation_duration = time.perf_counter() - t0 -> HabanaProfile.enable() -> total_new_tokens_generated = 0 -> logger.info("Running generate...") -> t0 = time.perf_counter() -> # Benchmark over n_iterations iterations -> if dyn_prompt_lens is None: -> for i in range(args.n_iterations): -> generated = generate(None, args.reduce_recompile) -> else: -> repeated_prompt_len = cycle(dyn_prompt_lens) -> for i in range(args.n_iterations): -> prompt_len = next(repeated_prompt_len) -> print("Generating for shape,", prompt_len) -> generated = generate(prompt_len, args.reduce_recompile) -> duration = time.perf_counter() - t0 -> total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens -> throughput = total_new_tokens_generated / duration -> -> print() -> print("Input/outputs:") -> for i, input_sentence in enumerate(zip(input_sentences)): -> print(f"input {i+1}: {input_sentence}") -> for j, output in enumerate( -> zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)]) -> ): -> print(f"output {j+1}: {output}") -> print() -> -> # Store results if necessary -> if args.output_dir is not None and args.global_rank == 0: -> output_dir = Path(args.output_dir) -> output_dir.mkdir(parents=True, exist_ok=True) -> -> results = { -> "throughput": throughput, -> "output": output, -> } -> with (output_dir / "results.json").open("w", encoding="utf-8") as f: -> json.dump(results, f, ensure_ascii=False, indent=4) -> -> stats = f"Throughput (including tokenization) = {throughput} tokens/second" -> stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}" -> 
separator = "-" * len(stats) -> print() -> print("Stats:") -> print(separator) -> print(stats) -> mem = get_hpu_memory_stats() -> for k, v in mem.items(): -> print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) -> print(f"Graph compilation duration = {compilation_duration} seconds") -> print(separator) -> print() -389c508,525 -< input_ids = encoded_prompt ---- -> # Downloading and loading a dataset from the hub. -> from datasets import load_dataset -> from torch.utils.data import DataLoader -> -> assert not args.simulate_dyn_prompt, "Both dataset_name and simulate_dyn_prompt are set" -> -> raw_dataset = load_dataset(args.dataset_name) -> if "test" in raw_dataset: -> split = "test" -> elif "validation" in raw_dataset: -> split = "validation" -> else: -> split = "train" -> raw_dataset = ( -> raw_dataset[split] -> .shuffle() -> .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows)) -> ) -391,397c527,534 -< if args.jit: -< jit_input_texts = ["enable jit"] -< jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer) -< torch._C._jit_set_texpr_fuser_enabled(False) -< model.config.return_dict = False -< if hasattr(model, "forward"): -< sig = inspect.signature(model.forward) ---- -> if args.column_name is None: -> # If no column name is given, take the first column that has strings -> column_name = [key for key in raw_dataset.features.keys() if raw_dataset.features[key].dtype == "string"][ -> 0 -> ] -> logger.info( -> f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`." -> ) -399,437c536,556 -< sig = inspect.signature(model.__call__) -< jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None) -< traced_model = torch.jit.trace(model, jit_inputs, strict=False) -< traced_model = torch.jit.freeze(traced_model.eval()) -< traced_model(*jit_inputs) -< traced_model(*jit_inputs) -< -< model = _ModelFallbackWrapper(traced_model, model) -< -< output_sequences = model.generate( -< input_ids=input_ids, -< max_length=args.length + len(encoded_prompt[0]), -< temperature=args.temperature, -< top_k=args.k, -< top_p=args.p, -< repetition_penalty=args.repetition_penalty, -< do_sample=True, -< num_return_sequences=args.num_return_sequences, -< ) -< -< # Remove the batch dimension when returning multiple sequences -< if len(output_sequences.shape) > 2: -< output_sequences.squeeze_() -< -< generated_sequences = [] -< -< for generated_sequence_idx, generated_sequence in enumerate(output_sequences): -< print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===") -< generated_sequence = generated_sequence.tolist() -< -< # Decode text -< text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) -< -< # Remove all text after the stop token -< text = text[: text.find(args.stop_token) if args.stop_token else None] -< -< # Add the prompt at the beginning of the sequence. 
Remove the excess text that was used for pre-processing -< total_sequence = ( -< prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :] ---- -> column_name = args.column_name -> -> # Remove unused columns -> raw_dataset = raw_dataset.remove_columns([name for name in raw_dataset.column_names if name != column_name]) -> -> # Set the prompt length to args.max_input_tokens if > 0 else (if 0 truncate to 16, otherwise use full length) -> prompt_length = args.max_input_tokens if args.max_input_tokens > 0 else (-1, 16)[args.max_input_tokens == 0] -> -> def preprocess_function(examples): -> # Tokenize the texts -> return tokenizer( -> examples[column_name], -> padding="max_length", -> max_length=prompt_length if prompt_length > 0 else None, -> truncation=prompt_length > 0, -> ) -> -> raw_dataset = raw_dataset.map( -> preprocess_function, -> batched=True, -> desc="Running tokenizer on dataset", -438a558,640 -> # After tokenization, we can remove the column of interest -> raw_dataset = raw_dataset.remove_columns([column_name]) -> raw_dataset.set_format(type="torch") -> -> if prompt_length <= 0: -> # Todo please check if this collate function is suitable for your model -> # This has been tested for OPT, llama, and Bloom -> assert model.config.model_type in ["opt", "bloom", "llama"] -> -> def collate_fn(data): -> collect = {k: [dt[k] for dt in data] for k in data[0]} -> result = {} -> for k in collect: -> tensors = collect[k] -> max_shape = max([item.shape[0] for item in tensors]) -> result[k] = torch.stack( -> [torch.cat((torch.zeros(max_shape - t.shape[0], dtype=t.dtype), t)) for t in tensors], 0 -> ) -> return result -> -> else: -> collate_fn = None -> -> dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn) -> -> def generate_dataset(batch): -> prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True) -> # Move inputs to target device(s) -> for t in batch: -> if torch.is_tensor(batch[t]): -> batch[t] = batch[t].to(args.device) -> # Generate new sequences -> outputs = model.generate( -> **batch, -> generation_config=generation_config, -> lazy_mode=use_lazy_mode, -> hpu_graphs=args.use_hpu_graphs, -> profiling_steps=args.profiling_steps, -> profiling_warmup_steps=args.profiling_warmup_steps, -> ignore_eos=args.ignore_eos, -> profiling_record_shapes=args.profiling_record_shapes, -> ).cpu() -> return prompt, outputs -> -> # warmup -> if prompt_length > 0: -> from optimum.habana.utils import HabanaProfile -> -> # compilation stage disable profiling -> HabanaProfile.disable() -> # Compilation -> logger.info("Graph compilation...") -> t0 = time.perf_counter() -> for i, batch in enumerate(dataloader): -> generate_dataset(batch) -> # The first three iterations take longer because of graph compilation -> if (i + 1) == 3: -> break -> torch_hpu.synchronize() -> compilation_duration = time.perf_counter() - t0 -> HabanaProfile.enable() -> -> total_new_tokens_generated = 0 -> duration = 0 -> separator = "-" * 50 -> logger.info("Running generate dataset...") -> t_start = time.time() -> for i, batch in enumerate(dataloader): -> t0 = time.perf_counter() -> prompt, outputs = generate_dataset(batch) -> duration += time.perf_counter() - t0 -> total_new_tokens_generated += args.batch_size * args.max_new_tokens -> print(separator) -> print(f"Batch n°{i+1}") -> print(f"Input: {prompt[:args.batch_size]}") -> print( -> f"Output: {tokenizer.batch_decode(outputs, 
skip_special_tokens=True)[:args.batch_size*args.num_return_sequences]}" -> ) -> print(separator) -> t_end = time.time() -> -> throughput = total_new_tokens_generated / duration -> # Print Stats -440,441c642,660 -< generated_sequences.append(total_sequence) -< print(total_sequence) ---- -> stats = f"Throughput (including tokenization) = {throughput} tokens/second" -> separator = "-" * len(stats) -> print() -> print("Stats:") -> print(separator) -> print(stats) -> print("Total runtime for dataset:", t_end - t_start) -> mem = get_hpu_memory_stats() -> for k, v in mem.items(): -> print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) -> if prompt_length > 0: -> print(f"Graph compilation duration = {compilation_duration} seconds") -> print(separator) -> if args.quant_config: -> import habana_quantization_toolkit -> -> habana_quantization_toolkit.finish_measurements(model) -> if args.const_serialization_path and os.path.isdir(args.const_serialization_path): -> import shutil -443c662 -< return generated_sequences ---- -> shutil.rmtree(args.const_serialization_path) diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt deleted file mode 100644 index 78cafcf01c..0000000000 --- a/tests/example_diff/run_glue.txt +++ /dev/null @@ -1,85 +0,0 @@ -29,30d28 -< from datasets import load_dataset -< -31a30 -> from datasets import load_dataset -40,41d38 -< Trainer, -< TrainingArguments, -43d39 -< set_seed, -48a45,54 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -> -> -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -50,51c56,61 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") ---- -> -> logger = logging.getLogger(__name__) -> -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -67,68d76 -< logger = logging.getLogger(__name__) -< -143a152,155 -> problem_type: Optional[str] = field( -> default="single_label_classification", -> metadata={"help": "Problem type, such as single_label_classification or multi_label_classification"}, -> ) -213a226,229 -> add_pad_token: bool = field( -> default=False, -> metadata={"help": "Will add `pad_token` to tokenizer and model's config as `eos_token` if it's not defined."}, -> ) -221c237 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -250a267,273 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -251a275 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -253,254c277,279 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -375a401 -> problem_type=data_args.problem_type, -416a443,447 -> if model_args.add_pad_token: -> if not model.config.pad_token_id and not tokenizer.pad_token: -> tokenizer.pad_token = tokenizer.eos_token -> model.config.pad_token_id = tokenizer.eos_token_id -> -527c558 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -528a560 -> gaudi_config=gaudi_config, -628,632d659 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt deleted file mode 100644 index 0dbbe3f6c2..0000000000 --- a/tests/example_diff/run_image_classification.txt +++ /dev/null @@ -1,59 +0,0 @@ -14a15,16 -> # limitations under the License. -> """Fine-tuning a 🤗 Transformers model for image classification""" -24a27 -> import transformers -37,38d39 -< -< import transformers -45,47d45 -< Trainer, -< TrainingArguments, -< set_seed, -52a51,60 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -> -> -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -54d61 -< """ Fine-tuning a 🤗 Transformers model for image classification""" -58,59c65,67 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") ---- -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -184c192 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -212a221,227 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -213a229 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -215,216c231,233 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -392c409 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -393a411 -> gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt deleted file mode 100644 index 372a913834..0000000000 --- a/tests/example_diff/run_mlm.txt +++ /dev/null @@ -1,122 +0,0 @@ -17,19c17,18 -< Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset. -< -< Here is the full list of checkpoints on the hub that can be fine-tuned by this script: ---- -> Training the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset. -> Here is the full list of checkpoints on the hub that can be trained by this script: -35,36d33 -< from datasets import load_dataset -< -37a35 -> from datasets import load_dataset -46,49d43 -< Trainer, -< TrainingArguments, -< is_torch_xla_available, -< set_seed, -54a49,50 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -56,57d51 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -59c53,59 -< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -61a62,69 -> -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") -> -> -137c145 -< "choices": ["auto", "bfloat16", "float16", "float32"], ---- -> "choices": ["auto", "bfloat16", "float32"], -145c153 -< "set True will benefit LLM loading time and RAM consumption." ---- -> "Setting it to True will benefit LLM loading time and RAM consumption." 
-230c238 -< streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) ---- -> streaming: bool = field(default=False, metadata={"help": "Enable streaming mode."}) -254c262 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -283a292,298 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -284a300 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -286,287c302,304 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -289d305 -< # Set the verbosity to info of the Transformers logger (on main process only): -617c633 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -618a635 -> gaudi_config=gaudi_config, -624,627c641,642 -< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, -< preprocess_logits_for_metrics=preprocess_logits_for_metrics -< if training_args.do_eval and not is_torch_xla_available() -< else None, ---- -> compute_metrics=compute_metrics if training_args.do_eval else None, -> preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -641,644c656,662 -< max_train_samples = ( -< data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) -< ) -< metrics["train_samples"] = min(max_train_samples, len(train_dataset)) ---- -> if data_args.streaming: -> metrics["train_samples"] = training_args.max_steps * training_args.per_device_train_batch_size -> else: -> max_train_samples = ( -> data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) -> ) -> metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -653d670 -< -656,657c673,678 -< max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) -< metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) ---- -> if not data_args.streaming: -> max_eval_samples = ( -> data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) -> ) -> metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) -> -680,684d700 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt deleted file mode 100644 index 118add46a1..0000000000 --- a/tests/example_diff/run_qa.txt +++ /dev/null @@ -1,77 +0,0 @@ -3c3 -< # Copyright 2020 The HuggingFace Team All rights reserved. ---- -> # Copyright 2022 The HuggingFace Team All rights reserved. 
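Beyond the trainer substitution, a recurring functional change in these diffs is padding-token handling for checkpoints that ship without one: run_glue (above) adds an `--add_pad_token` fallback to the EOS token, and run_qa (below) adds a llama-specific `[PAD]` token and reuses BOS as the CLS token. A stand-alone sketch of that logic follows; the helper name is hypothetical and only mirrors the snippets shown in these diffs.

```python
# Illustrative helper mirroring the pad-token handling added in run_glue/run_qa.
from transformers import AutoConfig, AutoTokenizer


def ensure_pad_token(model_name_or_path: str):
    config = AutoConfig.from_pretrained(model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token is None:
        if config.model_type == "llama":
            # run_qa: add an explicit [PAD] token and reuse BOS as CLS
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            tokenizer.cls_token = tokenizer.bos_token
        else:
            # run_glue (--add_pad_token): fall back to the EOS token
            tokenizer.pad_token = tokenizer.eos_token
            config.pad_token_id = tokenizer.eos_token_id
    return config, tokenizer
```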
-29a30 -> import transformers -32,34d32 -< from utils_qa import postprocess_qa_predictions -< -< import transformers -43d40 -< TrainingArguments, -45d41 -< set_seed, -49a46 -> from utils_qa import postprocess_qa_predictions -50a48,49 -> from optimum.habana import GaudiConfig, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -52,53d50 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -55c52,58 -< require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -58a62,67 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") -> -146c155 -< " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)." ---- -> " batching to the maximum length in the batch (which can be faster on GPU but will be slower on HPU)." -233c242 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -262a272,278 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -263a280 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -265,266c282,284 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -346a365,368 -> if config.model_type == "llama": -> if tokenizer.pad_token is None: -> tokenizer.add_special_tokens({"pad_token": "[PAD]"}) -> tokenizer.cls_token = tokenizer.bos_token -637a660 -> gaudi_config=gaudi_config, -706,710d728 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt deleted file mode 100644 index 817c72b5a9..0000000000 --- a/tests/example_diff/run_seq2seq_qa.txt +++ /dev/null @@ -1,64 +0,0 @@ -29a30 -> import transformers -32,33d32 -< -< import transformers -40,41d38 -< Seq2SeqTrainingArguments, -< set_seed, -46a44,45 -> from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainingArguments -> from optimum.habana.utils import set_seed -48,49d46 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") -51c48,54 -< require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -54a58,63 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") -> -178c187 -< " batching to the maximum length in the batch (which can be faster on GPU but will be slower on TPU)." ---- -> " batching to the maximum length in the batch (which can be faster on GPU but will be slower on HPU)." -278c287 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) -307a317,323 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -308a325 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -310,311c327,329 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -660a679 -> gaudi_config=gaudi_config, -734,738d752 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt deleted file mode 100644 index 1fab0abcf2..0000000000 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ /dev/null @@ -1,130 +0,0 @@ -32,33d31 -< from datasets import DatasetDict, load_dataset -< -34a33 -> from datasets import DatasetDict, load_dataset -42,43d40 -< Trainer, -< TrainingArguments, -45d41 -< set_seed, -50a47,48 -> from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments -> from optimum.habana.utils import set_seed -52,53d49 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -55c51,56 -< require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -59a61,66 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") -> -144c151 -< "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. 
Can be very" ---- -> "help": "Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very " -154d160 -< -400c406 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiTrainingArguments)) -434a441,446 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> token=data_args.token, -> ) -> -435a448 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -437,438c450,452 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -451,457c465,470 -< if training_args.do_train: -< raw_datasets["train"] = load_dataset( -< data_args.dataset_name, -< data_args.dataset_config_name, -< split=data_args.train_split_name, -< token=data_args.token, -< ) ---- -> raw_datasets["train"] = load_dataset( -> data_args.dataset_name, -> data_args.dataset_config_name, -> split=data_args.train_split_name, -> token=data_args.token, -> ) -459,464c472,477 -< if data_args.audio_column_name not in raw_datasets["train"].column_names: -< raise ValueError( -< f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." -< " Make sure to set `--audio_column_name` to the correct audio column - one of" -< f" {', '.join(raw_datasets['train'].column_names)}." -< ) ---- -> if data_args.audio_column_name not in raw_datasets["train"].column_names: -> raise ValueError( -> f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." -> " Make sure to set `--audio_column_name` to the correct audio column - one of" -> f" {', '.join(raw_datasets['train'].column_names)}." -> ) -466,471c479,484 -< if data_args.text_column_name not in raw_datasets["train"].column_names: -< raise ValueError( -< f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " -< "Make sure to set `--text_column_name` to the correct text column - one of " -< f"{', '.join(raw_datasets['train'].column_names)}." -< ) ---- -> if data_args.text_column_name not in raw_datasets["train"].column_names: -> raise ValueError( -> f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " -> "Make sure to set `--text_column_name` to the correct text column - one of " -> f"{', '.join(raw_datasets['train'].column_names)}." 
-> ) -473,474c486,487 -< if data_args.max_train_samples is not None: -< raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) ---- -> if data_args.max_train_samples is not None: -> raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) -492c505 -< f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None ---- -> f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None -631a645,649 -> raise RuntimeError( -> f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" -> f" ({feature_extractor.sampling_rate}).Data resampling should be done. The Datasets library does not" -> " support it on HPUs yet." -> ) -741c759,762 -< processor=processor, feature_extractor_input_name=feature_extractor_input_name ---- -> processor=processor, -> feature_extractor_input_name=feature_extractor_input_name, -> pad_to_multiple_of=int(max_input_length), -> pad_to_multiple_of_labels=500, -745c766 -< trainer = Trainer( ---- -> trainer = GaudiTrainer( -746a768 -> gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt deleted file mode 100644 index 45b00bef9b..0000000000 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ /dev/null @@ -1,76 +0,0 @@ -31,32d30 -< from datasets import DatasetDict, load_dataset -< -33a32 -> from datasets import DatasetDict, load_dataset -41,43d39 -< Seq2SeqTrainer, -< Seq2SeqTrainingArguments, -< set_seed, -48a45,55 -> from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -> from optimum.habana.utils import set_seed -> -> -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -51c58,59 -< check_min_version("4.42.0.dev0") ---- -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -230a239,242 -> label_features_max_length: int = field( -> default=None, -> metadata={"help": "Max length for padding label features."}, -> ) -248a261 -> label_features_max_length: int -262c275,279 -< labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") ---- -> kwargs = {} -> if self.label_features_max_length is not None: -> kwargs["padding"] = "max_length" -> kwargs["max_length"] = self.label_features_max_length -> labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt", **kwargs) -282c299 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) -309a327,332 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> token=model_args.token, -> ) -> -310a334 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -312,313c336,338 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: 
{mixed_precision}" -442d466 -< model.generation_config.forced_decoder_ids = model_args.forced_decoder_ids -456a481,484 -> logger.warning( -> f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" -> f" ({feature_extractor.sampling_rate}).Data resampling should be done." -> ) -561a590 -> label_features_max_length=data_args.label_features_max_length, -565c594 -< trainer = Seq2SeqTrainer( ---- -> trainer = GaudiSeq2SeqTrainer( -566a596 -> gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt deleted file mode 100644 index 9f01193b14..0000000000 --- a/tests/example_diff/run_summarization.txt +++ /dev/null @@ -1,213 +0,0 @@ -3c3 -< # Copyright 2021 The HuggingFace Team. All rights reserved. ---- -> # Copyright 2022 The HuggingFace Team. All rights reserved. -20a21 -> import copy -30a32,33 -> import torch -> import transformers -33,34d35 -< -< import transformers -45,47c46 -< Seq2SeqTrainer, -< Seq2SeqTrainingArguments, -< set_seed, ---- -> default_data_collator, -48a48 -> from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -52a53,54 -> from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -> from optimum.habana.utils import set_seed -54,55d55 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -57c57,63 -< require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -60a67,72 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") -> -129a142,150 -> use_cache: bool = field( -> default=True, -> metadata={ -> "help": ( -> "Whether or not the model should return the last key/values attentions (not used by all models)." -> "Only relevant if `config.is_decoder=True`." -> ) -> }, -> ) -213c234 -< "efficient on GPU but very bad for TPU." ---- -> "efficient on GPU but very bad for HPU in lazy mode." 
-261a283 -> source_suffix: Optional[str] = field(default="", metadata={"help": "A suffix to add after every source text."}) -317c339 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) -346a369,375 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -347a377 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -349,350c379,381 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -431a463 -> use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -450a483,488 -> is_bart = model.config.model_type == "bart" -> if is_bart and training_args.do_train: -> raise ValueError( -> "Training is not yet supported for BART. Eval or predict can be enabled with `--do_eval` and `--do_predict`." -> ) -> -453c491,498 -< embedding_size = model.get_input_embeddings().weight.shape[0] ---- -> embeddings = model.get_input_embeddings() -> if is_deepspeed_zero3_enabled(): -> import deepspeed -> -> with deepspeed.zero.GatheredParameters(embeddings.weight, modifier_rank=None): -> embedding_size = embeddings.weight.shape[0] -> else: -> embedding_size = embeddings.weight.shape[0] -486a532 -> suffix = data_args.source_suffix if data_args.source_suffix is not None else "" -557a604,605 -> else: -> raise ValueError("Found case where either text or summary is missing.") -559c607 -< inputs = [prefix + inp for inp in inputs] ---- -> inputs = [prefix + inp + suffix for inp in inputs] -574a623,662 -> def preprocess_bucketing_function(examples): -> # remove pairs where at least one record is None -> -> inputs, targets = [], [] -> for i in range(len(examples[text_column])): -> if examples[text_column][i] and examples[summary_column][i]: -> inputs.append(examples[text_column][i]) -> targets.append(examples[summary_column][i]) -> else: -> raise ValueError("Found case where either text or summary is missing.") -> -> inputs = [prefix + inp + suffix for inp in inputs] -> model_inputs = tokenizer(inputs, return_tensors="pt", padding=True) -> new_model_inputs = {"input_ids": []} -> for i in range(len(model_inputs["input_ids"])): -> cur_len = model_inputs["input_ids"][i].shape[-1] -> max_length = (cur_len + 128 - 1) // 128 * 128 -> if max_length > data_args.max_source_length: -> max_length = data_args.max_source_length -> new_model_inputs["input_ids"].append(model_inputs["input_ids"][i][:max_length]) -> else: -> new_model_inputs["input_ids"].append( -> torch.nn.functional.pad( -> model_inputs["input_ids"][i], (0, max_length - cur_len), value=tokenizer.pad_token_id -> ) -> ) -> model_inputs = new_model_inputs -> # Tokenize targets with the `text_target` keyword argument -> labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) -> -> # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore -> # padding in the loss. 
-> if padding == "max_length" and data_args.ignore_pad_token_for_loss: -> labels["input_ids"] = [ -> [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] -> ] -> -> model_inputs["labels"] = labels["input_ids"] -> return model_inputs -> -589a678,683 -> def wrapper_preprocess_function(examples): -> if model.config.is_encoder_decoder: -> return preprocess_bucketing_function(examples) -> else: -> return preprocess_function(examples) -> -598c692 -< preprocess_function, ---- -> wrapper_preprocess_function, -614c708 -< preprocess_function, ---- -> wrapper_preprocess_function, -624,629c718,726 -< data_collator = DataCollatorForSeq2Seq( -< tokenizer, -< model=model, -< label_pad_token_id=label_pad_token_id, -< pad_to_multiple_of=8 if training_args.fp16 else None, -< ) ---- -> if data_args.pad_to_max_length: -> data_collator = default_data_collator -> else: -> data_collator = DataCollatorForSeq2Seq( -> tokenizer, -> model=model, -> label_pad_token_id=label_pad_token_id, -> pad_to_multiple_of=8 if training_args.fp16 else None, -> ) -664,671c761,769 -< training_args.generation_max_length = ( -< training_args.generation_max_length -< if training_args.generation_max_length is not None -< else data_args.val_max_target_length -< ) -< training_args.generation_num_beams = ( -< data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams -< ) ---- -> training_args.generation_config = copy.deepcopy(model.generation_config) -> if training_args.generation_max_length is not None: -> training_args.generation_config.max_length = training_args.generation_max_length -> else: -> training_args.generation_config.max_length = data_args.val_max_target_length -> if data_args.num_beams is not None: -> training_args.generation_config.num_beams = data_args.num_beams -> elif training_args.generation_num_beams is not None: -> training_args.generation_config.num_beams = training_args.generation_num_beams -674c772 -< trainer = Seq2SeqTrainer( ---- -> trainer = GaudiSeq2SeqTrainer( -675a774 -> gaudi_config=gaudi_config, -764,768d862 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt deleted file mode 100644 index 1aa504c06f..0000000000 --- a/tests/example_diff/run_translation.txt +++ /dev/null @@ -1,99 +0,0 @@ -30,31d29 -< from datasets import load_dataset -< -32a31 -> from datasets import load_dataset -44,45c43 -< Seq2SeqTrainer, -< Seq2SeqTrainingArguments, ---- -> NllbTokenizerFast, -47d44 -< set_seed, -52a50,51 -> from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -> from optimum.habana.utils import set_seed -54,55d52 -< # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") -57c54,60 -< require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") ---- -> try: -> from optimum.habana.utils import check_optimum_habana_min_version -> except ImportError: -> -> def check_optimum_habana_min_version(*a, **b): -> return () -> -60a64,69 -> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.40.0") -> check_optimum_habana_min_version("1.11.0") -> -> require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") -> -62c71,78 -< MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast, M2M100Tokenizer] ---- -> MULTILINGUAL_TOKENIZERS = [ -> MBartTokenizer, -> MBartTokenizerFast, -> MBart50Tokenizer, -> MBart50TokenizerFast, -> M2M100Tokenizer, -> NllbTokenizerFast, -> ] -110a127,135 -> use_cache: bool = field( -> default=True, -> metadata={ -> "help": ( -> "Whether or not the model should return the last key/values attentions (not used by all models)." -> "Only relevant if `config.is_decoder=True`." -> ) -> }, -> ) -181c206 -< "efficient on GPU but very bad for TPU." ---- -> "efficient on GPU but very bad for HPU in lazy mode." -266c291 -< parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) ---- -> parser = HfArgumentParser((ModelArguments, DataTrainingArguments, GaudiSeq2SeqTrainingArguments)) -295a321,327 -> gaudi_config = GaudiConfig.from_pretrained( -> training_args.gaudi_config_name, -> cache_dir=model_args.cache_dir, -> revision=model_args.model_revision, -> token=model_args.token, -> ) -> -296a329 -> mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast -298,299c331,333 -< f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " -< + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ---- -> f"Process rank: {training_args.local_rank}, device: {training_args.device}, " -> + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " -> + f"mixed-precision training: {mixed_precision}" -384a419 -> use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -456c491 -< # Check the whether the source target length fits in the model, if it has absolute positional embeddings ---- -> # Check whether the source target length fits in the model, if it has absolute positional embeddings -594c629 -< trainer = Seq2SeqTrainer( ---- -> trainer = GaudiSeq2SeqTrainer( -595a631 -> gaudi_config=gaudi_config, -688,692d723 -< -< -< def _mp_fn(index): -< # For xla_spawn (TPUs) -< main() diff --git a/tests/resource/img/image-captioning-example.png b/tests/resource/img/image-captioning-example.png deleted file mode 100644 index 70f08c0a43..0000000000 Binary files a/tests/resource/img/image-captioning-example.png and /dev/null differ diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py deleted file mode 100755 index e9628620e7..0000000000 --- a/tests/test_diffusers.py +++ /dev/null @@ -1,2211 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os -import random -import re -import subprocess -import tempfile -from io import BytesIO -from pathlib import Path -from typing import Union -from unittest import TestCase, skipUnless - -import numpy as np -import pytest -import requests -import safetensors -import torch -from diffusers import ( - AutoencoderKL, - AutoencoderKLTemporalDecoder, - ControlNetModel, - UNet2DConditionModel, - UNetSpatioTemporalConditionModel, - UniPCMultistepScheduler, -) -from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel -from diffusers.utils import load_image, numpy_to_pil -from diffusers.utils.testing_utils import floats_tensor -from diffusers.utils.torch_utils import randn_tensor -from huggingface_hub import snapshot_download -from parameterized import parameterized -from PIL import Image -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from transformers.testing_utils import parse_flag_from_env, slow - -from optimum.habana import GaudiConfig -from optimum.habana.diffusers import ( - GaudiDDIMScheduler, - GaudiDiffusionPipeline, - GaudiEulerAncestralDiscreteScheduler, - GaudiEulerDiscreteScheduler, - GaudiStableDiffusionControlNetPipeline, - GaudiStableDiffusionLDM3DPipeline, - GaudiStableDiffusionPipeline, - GaudiStableDiffusionUpscalePipeline, - GaudiStableDiffusionXLPipeline, - GaudiStableVideoDiffusionPipeline, -) -from optimum.habana.utils import set_seed - -from .clip_coco_utils import calculate_clip_score, download_files - - -IS_GAUDI2 = os.environ.get("GAUDI2_CI", "0") == "1" - - -if IS_GAUDI2: - THROUGHPUT_BASELINE_BF16 = 1.086 - THROUGHPUT_BASELINE_AUTOCAST = 0.394 - TEXTUAL_INVERSION_THROUGHPUT = 104.29806 - TEXTUAL_INVERSION_RUNTIME = 114.1344320399221 - CONTROLNET_THROUGHPUT = 92.886919836857 - CONTROLNET_RUNTIME = 537.4276602957398 -else: - THROUGHPUT_BASELINE_BF16 = 0.309 - THROUGHPUT_BASELINE_AUTOCAST = 0.114 - TEXTUAL_INVERSION_THROUGHPUT = 60.5991479573174 - TEXTUAL_INVERSION_RUNTIME = 196.43840550999994 - CONTROLNET_THROUGHPUT = 44.7278034963213 - CONTROLNET_RUNTIME = 1116.084316640001 - - -_run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False) - - -def custom_bf16_ops(test_case): - """ - Decorator marking a test as needing custom bf16 ops. - Custom bf16 ops must be declared before `habana_frameworks.torch.core` is imported, which is not possible if some other tests are executed before. - - Such tests are skipped by default. Set the CUSTOM_BF16_OPS environment variable to a truthy value to run them. - - """ - return skipUnless(_run_custom_bf16_ops_test_, "test requires custom bf16 ops")(test_case) - - -class GaudiPipelineUtilsTester(TestCase): - """ - Tests the features added on top of diffusers/pipeline_utils.py. 
- """ - - def test_use_hpu_graphs_raise_error_without_habana(self): - with self.assertRaises(ValueError): - _ = GaudiDiffusionPipeline( - use_habana=False, - use_hpu_graphs=True, - ) - - def test_gaudi_config_raise_error_without_habana(self): - with self.assertRaises(ValueError): - _ = GaudiDiffusionPipeline( - use_habana=False, - gaudi_config=GaudiConfig(), - ) - - def test_device(self): - pipeline_1 = GaudiDiffusionPipeline( - use_habana=True, - gaudi_config=GaudiConfig(), - ) - self.assertEqual(pipeline_1._device.type, "hpu") - - pipeline_2 = GaudiDiffusionPipeline( - use_habana=False, - ) - self.assertEqual(pipeline_2._device.type, "cpu") - - def test_gaudi_config_types(self): - # gaudi_config is a string - _ = GaudiDiffusionPipeline( - use_habana=True, - gaudi_config="Habana/stable-diffusion", - ) - - # gaudi_config is instantiated beforehand - gaudi_config = GaudiConfig.from_pretrained("Habana/stable-diffusion") - _ = GaudiDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - ) - - def test_default(self): - pipeline = GaudiDiffusionPipeline( - use_habana=True, - gaudi_config=GaudiConfig(), - ) - - self.assertTrue(hasattr(pipeline, "htcore")) - - def test_use_hpu_graphs(self): - pipeline = GaudiDiffusionPipeline( - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(), - ) - - self.assertTrue(hasattr(pipeline, "ht")) - self.assertTrue(hasattr(pipeline, "hpu_stream")) - self.assertTrue(hasattr(pipeline, "cache")) - - def test_save_pretrained(self): - model_name = "hf-internal-testing/tiny-stable-diffusion-torch" - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - gaudi_config=GaudiConfig(), - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - pipeline.save_pretrained(tmp_dir) - self.assertTrue(Path(tmp_dir, "gaudi_config.json").is_file()) - - -class GaudiStableDiffusionPipelineTester(TestCase): - """ - Tests the StableDiffusionPipeline for Gaudi. 
- """ - - def get_dummy_components(self, time_cond_proj_dim=None): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=1, - sample_size=32, - time_cond_proj_dim=time_cond_proj_dim, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - norm_num_groups=2, - ) - scheduler = GaudiDDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=64, - layer_norm_eps=1e-05, - num_attention_heads=8, - num_hidden_layers=3, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" - - components = self.get_dummy_components() - gaudi_config = GaudiConfig(use_torch_autocast=False) - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images[0] - - image_slice = image[-3:, -3:, -1] - - self.assertEqual(image.shape, (64, 64, 3)) - expected_slice = np.array([0.3203, 0.4555, 0.4711, 0.3505, 0.3973, 0.4650, 0.5137, 0.3392, 0.4045]) - - self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2) - - def test_stable_diffusion_no_safety_checker(self): - gaudi_config = GaudiConfig() - scheduler = GaudiDDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - pipe = GaudiStableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-pipe", - scheduler=scheduler, - safety_checker=None, - use_habana=True, - gaudi_config=gaudi_config, - ) - self.assertIsInstance(pipe, GaudiStableDiffusionPipeline) - self.assertIsInstance(pipe.scheduler, GaudiDDIMScheduler) - self.assertIsNone(pipe.safety_checker) - - image = pipe("example prompt", num_inference_steps=2).images[0] - self.assertIsNotNone(image) - - # Check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = GaudiStableDiffusionPipeline.from_pretrained( - tmpdirname, - use_habana=True, - gaudi_config=tmpdirname, - ) - - # Sanity check that the pipeline still works - 
self.assertIsNone(pipe.safety_checker) - image = pipe("example prompt", num_inference_steps=2).images[0] - self.assertIsNotNone(image) - - @parameterized.expand(["pil", "np", "latent"]) - def test_stable_diffusion_output_types(self, output_type): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - num_prompts = 2 - num_images_per_prompt = 3 - - outputs = sd_pipe( - num_prompts * [prompt], - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=2, - output_type=output_type, - ) - - self.assertEqual(len(outputs.images), 2 * 3) - # TODO: enable safety checker - # if output_type == "latent": - # self.assertIsNone(outputs.nsfw_content_detected) - # else: - # self.assertEqual(len(outputs.nsfw_content_detected), 2 * 3) - - # TODO: enable this test when PNDMScheduler is adapted to Gaudi - # def test_stable_diffusion_negative_prompt(self): - # device = "cpu" # ensure determinism for the device-dependent torch.Generator - # unet = self.dummy_cond_unet - # scheduler = PNDMScheduler(skip_prk_steps=True) - # vae = self.dummy_vae - # bert = self.dummy_text_encoder - # tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # # make sure here that pndm scheduler skips prk - # sd_pipe = StableDiffusionPipeline( - # unet=unet, - # scheduler=scheduler, - # vae=vae, - # text_encoder=bert, - # tokenizer=tokenizer, - # safety_checker=None, - # feature_extractor=self.dummy_extractor, - # ) - # sd_pipe = sd_pipe.to(device) - # sd_pipe.set_progress_bar_config(disable=None) - - # prompt = "A painting of a squirrel eating a burger" - # negative_prompt = "french fries" - # generator = torch.Generator(device=device).manual_seed(0) - # output = sd_pipe( - # prompt, - # negative_prompt=negative_prompt, - # generator=generator, - # guidance_scale=6.0, - # num_inference_steps=2, - # output_type="np", - # ) - - # image = output.images - # image_slice = image[0, -3:, -3:, -1] - - # assert image.shape == (1, 128, 128, 3) - # expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719]) - # assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_num_images_per_prompt(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test num_images_per_prompt=1 (default) - images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images - - self.assertEqual(len(images), 1) - self.assertEqual(images[0].shape, (64, 64, 3)) - - # Test num_images_per_prompt=1 (default) for several prompts - num_prompts = 3 - images = sd_pipe([prompt] * num_prompts, num_inference_steps=2, output_type="np").images - - self.assertEqual(len(images), num_prompts) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for 
several prompts - num_prompts = 2 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_batch_sizes(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test num_images > 1 where num_images is a divider of the total number of generated images - batch_size = 3 - num_images_per_prompt = batch_size**2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 3 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images when it is not a divider of the total number of generated images for a single prompt - num_images_per_prompt = 7 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 2 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_bf16(self): - """Test that stable diffusion works with bf16""" - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images[0] - - self.assertEqual(image.shape, (64, 64, 3)) - - def test_stable_diffusion_default(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - num_inference_steps=2, - output_type="np", - batch_size=3, - num_images_per_prompt=5, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_hpu_graphs(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionPipeline( - use_habana=True, - use_hpu_graphs=True, - 
gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - num_inference_steps=2, - output_type="np", - batch_size=3, - num_images_per_prompt=5, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - @slow - def test_no_throughput_regression_bf16(self): - prompts = [ - "An image of a squirrel in Picasso style", - "High quality photo of an astronaut riding a horse in space", - ] - num_images_per_prompt = 11 - batch_size = 4 - model_name = "runwayml/stable-diffusion-v1-5" - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"), - torch_dtype=torch.bfloat16, - ) - set_seed(27) - outputs = pipeline( - prompt=prompts, - num_images_per_prompt=num_images_per_prompt, - batch_size=batch_size, - ) - self.assertEqual(len(outputs.images), num_images_per_prompt * len(prompts)) - self.assertGreaterEqual(outputs.throughput, 0.95 * THROUGHPUT_BASELINE_BF16) - - @custom_bf16_ops - @slow - def test_no_throughput_regression_autocast(self): - prompts = [ - "An image of a squirrel in Picasso style", - "High quality photo of an astronaut riding a horse in space", - ] - num_images_per_prompt = 11 - batch_size = 4 - model_name = "stabilityai/stable-diffusion-2-1" - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion-2"), - ) - set_seed(27) - outputs = pipeline( - prompt=prompts, - num_images_per_prompt=num_images_per_prompt, - batch_size=batch_size, - height=768, - width=768, - ) - self.assertEqual(len(outputs.images), num_images_per_prompt * len(prompts)) - self.assertGreaterEqual(outputs.throughput, 0.95 * THROUGHPUT_BASELINE_AUTOCAST) - - @slow - def test_no_generation_regression(self): - seed = 27 - set_seed(seed) - model_name = "CompVis/stable-diffusion-v1-4" - # fp32 - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipeline = GaudiStableDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - safety_checker=None, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(use_torch_autocast=False), - ) - - prompt = "An image of a squirrel in Picasso style" - generator = torch.manual_seed(seed) - outputs = pipeline( - prompt=prompt, - generator=generator, - output_type="np", - ) - - if IS_GAUDI2: - target_score = 29.8925 - else: - target_score = 36.774 - - image = outputs.images[0] - pil_image = numpy_to_pil(image)[0] - pil_image.save("test_no_generation_regression_output.png") - - clip_score = calculate_clip_score(np.expand_dims(image, axis=0), [prompt]) - - self.assertEqual(image.shape, (512, 512, 3)) - self.assertGreaterEqual(clip_score, target_score) - - @slow - def test_no_generation_regression_ldm3d(self): - seed = 27 - set_seed(seed) - model_name = "Intel/ldm3d-4c" - # fp32 - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipeline = 
GaudiStableDiffusionLDM3DPipeline.from_pretrained( - model_name, - scheduler=scheduler, - safety_checker=None, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(), - ) - - prompt = "An image of a squirrel in Picasso style" - generator = torch.manual_seed(seed) - outputs = pipeline( - prompt=prompt, - generator=generator, - output_type="np", - ) - - if IS_GAUDI2: - target_score = 28.0894 - else: - target_score = 35.81 - - rgb = outputs.rgb[0] - depth = outputs.depth[0] - - rgb_clip_score = calculate_clip_score(np.expand_dims(rgb, axis=0), [prompt]) - - self.assertEqual(rgb.shape, (512, 512, 3)) - self.assertEqual(depth.shape, (512, 512, 1)) - self.assertGreaterEqual(rgb_clip_score, target_score) - - @slow - def test_no_generation_regression_upscale(self): - model_name = "stabilityai/stable-diffusion-x4-upscaler" - # fp32 - scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") - pipeline = GaudiStableDiffusionUpscalePipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(use_torch_autocast=False), - ) - set_seed(27) - - url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" - response = requests.get(url) - low_res_img = Image.open(BytesIO(response.content)).convert("RGB") - low_res_img = low_res_img.resize((128, 128)) - prompt = "a white cat" - upscaled_image = pipeline(prompt=prompt, image=low_res_img, output_type="np").images[0] - if IS_GAUDI2: - expected_slice = np.array( - [ - 0.16527882, - 0.161616, - 0.15665859, - 0.1660901, - 0.1594379, - 0.14936888, - 0.1578255, - 0.15342498, - 0.14590919, - ] - ) - else: - expected_slice = np.array( - [ - 0.1652787, - 0.16161594, - 0.15665877, - 0.16608998, - 0.1594378, - 0.14936894, - 0.15782538, - 0.15342498, - 0.14590913, - ] - ) - self.assertEqual(upscaled_image.shape, (512, 512, 3)) - self.assertLess(np.abs(expected_slice - upscaled_image[-3:, -3:, -1].flatten()).max(), 5e-3) - - @slow - def test_textual_inversion(self): - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "textual_inversion.py" - ) - - with tempfile.TemporaryDirectory() as data_dir: - snapshot_download( - "diffusers/cat_toy_example", local_dir=data_dir, repo_type="dataset", ignore_patterns=".gitattributes" - ) - with tempfile.TemporaryDirectory() as run_dir: - cmd_line = [ - "python3", - f"{path_to_script.parent.parent.parent / 'gaudi_spawn.py'}", - "--use_mpi", - "--world_size", - "8", - f"{path_to_script}", - "--pretrained_model_name_or_path runwayml/stable-diffusion-v1-5", - f"--train_data_dir {data_dir}", - '--learnable_property "object"', - '--placeholder_token ""', - '--initializer_token "toy"', - "--resolution 512", - "--train_batch_size 4", - "--max_train_steps 375", - "--learning_rate 5.0e-04", - "--scale_lr", - '--lr_scheduler "constant"', - "--lr_warmup_steps 0", - f"--output_dir {run_dir}", - "--save_as_full_pipeline", - "--gaudi_config_name Habana/stable-diffusion", - "--throughput_warmup_steps 3", - "--seed 27", - ] - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - cmd_line = [x for y in cmd_line for x in re.split(pattern, y) if x] - - # Run textual inversion - p = subprocess.Popen(cmd_line) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - # Assess throughput - with open(Path(run_dir) / "speed_metrics.json") as fp: - results = json.load(fp) - 
self.assertGreaterEqual(results["train_samples_per_second"], 0.95 * TEXTUAL_INVERSION_THROUGHPUT) - self.assertLessEqual(results["train_runtime"], 1.05 * TEXTUAL_INVERSION_RUNTIME) - - # Assess generated image - pipe = GaudiStableDiffusionPipeline.from_pretrained( - run_dir, - torch_dtype=torch.bfloat16, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(use_habana_mixed_precision=False), - ) - prompt = "A backpack" - set_seed(27) - image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5, output_type="np").images[0] - - # TODO: see how to generate images in a reproducible way - # expected_slice = np.array( - # [0.57421875, 0.5703125, 0.58203125, 0.58203125, 0.578125, 0.5859375, 0.578125, 0.57421875, 0.56640625] - # ) - self.assertEqual(image.shape, (512, 512, 3)) - # self.assertLess(np.abs(expected_slice - image[-3:, -3:, -1].flatten()).max(), 5e-3) - - -class GaudiStableDiffusionXLPipelineTester(TestCase): - """ - Tests the StableDiffusionXLPipeline for Gaudi. - """ - - def get_dummy_components(self, time_cond_proj_dim=None, timestep_spacing="leading"): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(2, 4), - layers_per_block=2, - time_cond_proj_dim=time_cond_proj_dim, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - addition_embed_type="text_time", - addition_time_embed_dim=8, - transformer_layers_per_block=(1, 2), - projection_class_embeddings_input_dim=80, # 6 * 8 + 32 - cross_attention_dim=64, - norm_num_groups=1, - ) - scheduler = GaudiEulerDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - steps_offset=1, - beta_schedule="scaled_linear", - timestep_spacing=timestep_spacing, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_encoder_2": text_encoder_2, - "tokenizer_2": tokenizer_2, - "image_encoder": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 5.0, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_xl_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - gaudi_config = 
GaudiConfig(use_torch_autocast=False) - sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images[0] - - image_slice = image[-3:, -3:, -1] - - self.assertEqual(image.shape, (64, 64, 3)) - expected_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.47]) - - # The threshold should be 1e-2 below but it started failing - # from Diffusers v0.24. However, generated images still look similar. - self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-1) - - def test_stable_diffusion_xl_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - gaudi_config = GaudiConfig(use_torch_autocast=False) - sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components) - sd_pipe.scheduler = GaudiEulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images[0] - - image_slice = image[-3:, -3:, -1] - - self.assertEqual(image.shape, (64, 64, 3)) - expected_slice = np.array([0.4675, 0.5173, 0.4611, 0.4067, 0.5250, 0.4674, 0.5446, 0.5094, 0.4791]) - self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2) - - def test_stable_diffusion_xl_turbo_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components(timestep_spacing="trailing") - gaudi_config = GaudiConfig(use_torch_autocast=False) - - sd_pipe = GaudiStableDiffusionXLPipeline(use_habana=True, gaudi_config=gaudi_config, **components) - sd_pipe.scheduler = GaudiEulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images[0] - - image_slice = image[-3:, -3:, -1] - - self.assertEqual(image.shape, (64, 64, 3)) - expected_slice = np.array([0.4675, 0.5173, 0.4611, 0.4067, 0.5250, 0.4674, 0.5446, 0.5094, 0.4791]) - self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2) - - @parameterized.expand(["pil", "np", "latent"]) - def test_stable_diffusion_xl_output_types(self, output_type): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - num_prompts = 2 - num_images_per_prompt = 3 - - outputs = sd_pipe( - num_prompts * [prompt], - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=2, - output_type=output_type, - ) - - self.assertEqual(len(outputs.images), 2 * 3) - - def test_stable_diffusion_xl_num_images_per_prompt(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test num_images_per_prompt=1 (default) - images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images - - self.assertEqual(len(images), 1) - self.assertEqual(images[0].shape, 
(64, 64, 3)) - - # Test num_images_per_prompt=1 (default) for several prompts - num_prompts = 3 - images = sd_pipe([prompt] * num_prompts, num_inference_steps=2, output_type="np").images - - self.assertEqual(len(images), num_prompts) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for several prompts - num_prompts = 2 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_xl_batch_sizes(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test batch_size > 1 where batch_size is a divider of the total number of generated images - batch_size = 3 - num_images_per_prompt = batch_size**2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 3 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test batch_size when it is not a divider of the total number of generated images for a single prompt - num_images_per_prompt = 7 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 2 - images = sd_pipe( - [prompt] * num_prompts, - num_inference_steps=2, - output_type="np", - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_xl_bf16(self): - """Test that stable diffusion works with bf16""" - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images[0] - - self.assertEqual(image.shape, (64, 64, 3)) - - def test_stable_diffusion_xl_default(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - 
gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - num_inference_steps=2, - output_type="np", - batch_size=3, - num_images_per_prompt=5, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_xl_hpu_graphs(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionXLPipeline( - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device="cpu").manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - num_inference_steps=2, - output_type="np", - batch_size=3, - num_images_per_prompt=5, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - -class GaudiStableDiffusionControlNetPipelineTester(TestCase): - """ - Tests the StableDiffusionControlNetPipeline for Gaudi. - """ - - def get_dummy_components(self, time_cond_proj_dim=None): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=32, - time_cond_proj_dim=time_cond_proj_dim, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - norm_num_groups=1, - ) - - def init_weights(m): - if isinstance(m, torch.nn.Conv2d): - torch.nn.init.normal(m.weight) - m.bias.data.fill_(1.0) - - torch.manual_seed(0) - controlnet = ControlNetModel( - block_out_channels=(4, 8), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - norm_num_groups=1, - ) - controlnet.controlnet_down_blocks.apply(init_weights) - - scheduler = GaudiDDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - controlnet_embedder_scale_factor = 2 - images = [ - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - ] - inputs = 
{ - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - "image": images, - } - return inputs - - def test_stable_diffusion_controlnet_num_images_per_prompt(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - prompt = inputs["prompt"] - # Test num_images_per_prompt=1 (default) - images = sd_pipe(**inputs).images - - self.assertEqual(len(images), 1) - self.assertEqual(images[0].shape, (64, 64, 3)) - - # Test num_images_per_prompt=1 (default) for several prompts - num_prompts = 3 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(**inputs).images - - self.assertEqual(len(images), num_prompts) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - inputs["prompt"] = prompt - images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - ## Test num_images_per_prompt for several prompts - num_prompts = 2 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_controlnet_batch_sizes(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - prompt = inputs["prompt"] - # Test batch_size > 1 where batch_size is a divider of the total number of generated images - batch_size = 3 - num_images_per_prompt = batch_size**2 - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 3 - inputs["prompt"] = [prompt] * num_prompts - - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - inputs["prompt"] = prompt - # Test batch_size when it is not a divider of the total number of generated images for a single prompt - num_images_per_prompt = 7 - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 2 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_controlnet_bf16(self): - """Test that stable diffusion works with bf16""" - components = 
self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - image = sd_pipe(**inputs).images[0] - - self.assertEqual(image.shape, (64, 64, 3)) - - def test_stable_diffusion_controlnet_default(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - inputs["prompt"] = [inputs["prompt"]] * 2 - images = sd_pipe( - batch_size=3, - num_images_per_prompt=5, - **inputs, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_controlnet_hpu_graphs(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - inputs["prompt"] = [inputs["prompt"]] * 2 - - images = sd_pipe( - batch_size=3, - num_images_per_prompt=5, - **inputs, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - -class GaudiStableDiffusionMultiControlNetPipelineTester(TestCase): - """ - Tests the StableDiffusionControlNetPipeline for Gaudi. - """ - - def get_dummy_components(self, time_cond_proj_dim=None): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=32, - time_cond_proj_dim=time_cond_proj_dim, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - norm_num_groups=1, - ) - - def init_weights(m): - if isinstance(m, torch.nn.Conv2d): - torch.nn.init.normal(m.weight) - m.bias.data.fill_(1.0) - - torch.manual_seed(0) - controlnet1 = ControlNetModel( - block_out_channels=(4, 8), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - norm_num_groups=1, - ) - controlnet1.controlnet_down_blocks.apply(init_weights) - - torch.manual_seed(0) - controlnet2 = ControlNetModel( - block_out_channels=(4, 8), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - norm_num_groups=1, - ) - controlnet2.controlnet_down_blocks.apply(init_weights) - - scheduler = GaudiDDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - 
text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - controlnet = MultiControlNetModel([controlnet1, controlnet2]) - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - controlnet_embedder_scale_factor = 2 - images = [ - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - ] - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - "image": images, - } - return inputs - - def test_stable_diffusion_multicontrolnet_num_images_per_prompt(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - prompt = inputs["prompt"] - # Test num_images_per_prompt=1 (default) - images = sd_pipe(**inputs).images - - self.assertEqual(len(images), 1) - self.assertEqual(images[0].shape, (64, 64, 3)) - - # Test num_images_per_prompt=1 (default) for several prompts - num_prompts = 3 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(**inputs).images - - self.assertEqual(len(images), num_prompts) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - inputs["prompt"] = prompt - images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - ## Test num_images_per_prompt for several prompts - num_prompts = 2 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_multicontrolnet_batch_sizes(self): - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - prompt = inputs["prompt"] - # Test batch_size > 1 where batch_size is a divider of the total number of generated images - batch_size = 3 - num_images_per_prompt = batch_size**2 - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 3 - inputs["prompt"] = [prompt] * num_prompts - - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - - 
self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - inputs["prompt"] = prompt - # Test batch_size when it is not a divider of the total number of generated images for a single prompt - num_images_per_prompt = 7 - images = sd_pipe( - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - **inputs, - ).images - - self.assertEqual(len(images), num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - # Same test for several prompts - num_prompts = 2 - inputs["prompt"] = [prompt] * num_prompts - images = sd_pipe(batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, **inputs).images - - self.assertEqual(len(images), num_prompts * num_images_per_prompt) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_multicontrolnet_bf16(self): - """Test that stable diffusion works with bf16""" - components = self.get_dummy_components() - gaudi_config = GaudiConfig() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config=gaudi_config, - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - image = sd_pipe(**inputs).images[0] - - self.assertEqual(image.shape, (64, 64, 3)) - - def test_stable_diffusion_multicontrolnet_default(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - inputs["prompt"] = [inputs["prompt"]] * 2 - images = sd_pipe( - batch_size=3, - num_images_per_prompt=5, - **inputs, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - def test_stable_diffusion_multicontrolnet_hpu_graphs(self): - components = self.get_dummy_components() - - sd_pipe = GaudiStableDiffusionControlNetPipeline( - use_habana=True, - use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", - **components, - ) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device="cpu") - inputs["prompt"] = [inputs["prompt"]] * 2 - - images = sd_pipe( - batch_size=3, - num_images_per_prompt=5, - **inputs, - ).images - - self.assertEqual(len(images), 10) - self.assertEqual(images[-1].shape, (64, 64, 3)) - - -class TrainTextToImage(TestCase): - """ - Tests the Stable Diffusion text_to_image Training for Gaudi. 
- """ - - def test_train_text_to_image_script(self): - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_text_to_image_sdxl.py" - ) - - cmd_line = f"""ls {path_to_script}""".split() - - # check find existence - p = subprocess.Popen(cmd_line) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - @slow - @pytest.mark.skip(reason="The dataset used in this test is not available at the moment.") - def test_train_text_to_image_sdxl(self): - with tempfile.TemporaryDirectory() as tmpdir: - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_text_to_image_sdxl.py" - ) - - cmd_line = f""" - python3 - {path_to_script} - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 - --pretrained_vae_model_name_or_path stabilityai/sdxl-vae - --dataset_name lambdalabs/pokemon-blip-captions - --resolution 512 - --crop_resolution 512 - --center_crop - --random_flip - --proportion_empty_prompts=0.2 - --train_batch_size 16 - --learning_rate 1e-05 - --max_grad_norm 1 - --lr_scheduler constant - --lr_warmup_steps 0 - --gaudi_config_name Habana/stable-diffusion - --throughput_warmup_steps 3 - --dataloader_num_workers 8 - --use_hpu_graphs_for_training - --use_hpu_graphs_for_inference - --bf16 - --max_train_steps 2 - --output_dir {tmpdir} - """.split() - - # Run train_text_to_image_sdxl.y - p = subprocess.Popen(cmd_line) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - # save_pretrained smoke test - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors"))) - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) - - -class TrainControlNet(TestCase): - """ - Tests the train_controlnet.py script for Gaudi. 
- """ - - def test_train_controlnet_script(self): - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_controlnet.py" - ) - - cmd_line = f"""ls {path_to_script}""".split() - - # check find existence - p = subprocess.Popen(cmd_line) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - @slow - def test_train_controlnet(self): - with tempfile.TemporaryDirectory() as tmpdir: - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_controlnet.py" - ) - - download_files( - [ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png", - ], - path=tmpdir, - ) - - cmd_line = f""" - python3 - {path_to_script.parent.parent.parent / 'gaudi_spawn.py'} - --use_mpi - --world_size 8 - {path_to_script} - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 - --dataset_name fusing/fill50k - --resolution 512 - --train_batch_size 4 - --learning_rate 1e-05 - --validation_steps 1000 - --validation_image "{tmpdir}/conditioning_image_1.png" "{tmpdir}/conditioning_image_2.png" - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" - --checkpointing_steps 1000 - --throughput_warmup_steps 3 - --use_hpu_graphs - --bf16 - --num_train_epochs 1 - --output_dir {tmpdir} - """.split() - - # Run train_controlnet.y - p = subprocess.Popen(cmd_line) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - # Assess throughput - with open(Path(tmpdir) / "speed_metrics.json") as fp: - results = json.load(fp) - self.assertGreaterEqual(results["train_samples_per_second"], 0.95 * CONTROLNET_THROUGHPUT) - self.assertLessEqual(results["train_runtime"], 1.05 * CONTROLNET_RUNTIME) - - # Assess generated image - controlnet = ControlNetModel.from_pretrained(tmpdir, torch_dtype=torch.bfloat16) - pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - torch_dtype=torch.bfloat16, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig(use_habana_mixed_precision=False), - ) - pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - - control_image = load_image(f"{tmpdir}/conditioning_image_1.png") - prompt = "pale golden rod circle with old lace background" - - generator = set_seed(27) - image = pipe( - prompt, num_inference_steps=20, generator=generator, image=control_image, output_type="np" - ).images[0] - - self.assertEqual(image.shape, (512, 512, 3)) - - -def install_requirements(requirements_filename: Union[str, os.PathLike]): - """ - Installs the necessary requirements to run the example if the provided file exists, otherwise does nothing. 
- """ - - if not Path(requirements_filename).exists(): - return - - cmd_line = f"pip install -r {requirements_filename}".split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - assert return_code == 0 - - -class DreamBooth(TestCase): - def _test_dreambooth(self, extra_config, train_text_encoder=False): - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_dreambooth.py" - ) - install_requirements(path_to_script.parent / "requirements.txt") - instance_prompt = "soccer player kicking a ball" - with tempfile.TemporaryDirectory() as tmpdir: - test_args = f""" - python3 - {path_to_script} - --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe - --instance_data_dir {Path(os.path.dirname(__file__))/'resource/img'} - --resolution 64 - --train_batch_size 1 - --gradient_accumulation_steps 1 - --train_text_encoder - --max_train_steps 1 - --learning_rate 5.0e-04 - --scale_lr - --lr_scheduler constant - --lr_warmup_steps 0 - --gaudi_config_name Habana/stable-diffusion - --output_dir {tmpdir} - """.split() - - test_args.append("--instance_prompt") - test_args.append(instance_prompt) - if "oft" not in extra_config: - test_args.append("--use_hpu_graphs_for_training") - test_args.append("--use_hpu_graphs_for_inference") - if train_text_encoder: - test_args.append("--train_text_encoder") - test_args.append(extra_config) - p = subprocess.Popen(test_args) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - # save_pretrained smoke test - if "full" in extra_config: - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors"))) - if train_text_encoder: - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "text_encoder", "model.safetensors"))) - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) - else: - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "adapter_model.safetensors"))) - if train_text_encoder: - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "text_encoder", "adapter_model.safetensors"))) - - def test_dreambooth_full(self): - self._test_dreambooth("full") - - def test_dreambooth_full_with_text_encoder(self): - self._test_dreambooth("full", train_text_encoder=True) - - def test_dreambooth_lora(self): - self._test_dreambooth("lora") - - def test_dreambooth_lora_with_text_encoder(self): - self._test_dreambooth("lora", train_text_encoder=True) - - def test_dreambooth_lokr(self): - self._test_dreambooth("lokr") - - def test_dreambooth_lokr_with_text_encoder(self): - self._test_dreambooth("lokr", train_text_encoder=True) - - def test_dreambooth_loha(self): - self._test_dreambooth("loha") - - def test_dreambooth_loha_with_text_encoder(self): - self._test_dreambooth("loha", train_text_encoder=True) - - def test_dreambooth_oft(self): - self._test_dreambooth("oft") - - def test_dreambooth_oft_with_text_encoder(self): - self._test_dreambooth("oft", train_text_encoder=True) - - -class DreamBoothLoRASDXL(TestCase): - def _test_dreambooth_lora_sdxl(self, train_text_encoder=False): - path_to_script = ( - Path(os.path.dirname(__file__)).parent - / "examples" - / "stable-diffusion" - / "training" - / "train_dreambooth_lora_sdxl.py" - ) - install_requirements(path_to_script.parent / "requirements.txt") - - instance_prompt = "soccer player kicking a ball" - with tempfile.TemporaryDirectory() as tmpdir: - test_args = f""" - python3 - {path_to_script} - 
--pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe - --instance_data_dir {Path(os.path.dirname(__file__))/'resource/img'} - --resolution 64 - --train_batch_size 1 - --gradient_accumulation_steps 1 - --max_train_steps 1 - --learning_rate 5.0e-04 - --scale_lr - --lr_scheduler constant - --lr_warmup_steps 0 - --gaudi_config_name Habana/stable-diffusion - --use_hpu_graphs_for_training - --use_hpu_graphs_for_inference - --output_dir {tmpdir} - """.split() - if train_text_encoder: - test_args.append("--train_text_encoder") - test_args.append("--instance_prompt") - test_args.append(instance_prompt) - p = subprocess.Popen(test_args) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - # save_pretrained smoke test - self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))) - - # make sure the state_dict has the correct naming in the parameters. - lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")) - is_lora = all("lora" in k for k in lora_state_dict.keys()) - self.assertTrue(is_lora) - - # when not training the text encoder, all the parameters in the state dict should start - # with `"unet"` in their names. - if train_text_encoder: - starts_with_unet = all( - k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") - for k in lora_state_dict.keys() - ) - else: - starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys()) - self.assertTrue(starts_with_unet) - - def test_dreambooth_lora_sdxl_with_text_encoder(self): - self._test_dreambooth_lora_sdxl(train_text_encoder=True) - - def test_dreambooth_lora_sdxl(self): - self._test_dreambooth_lora_sdxl(train_text_encoder=False) - - -class GaudiStableVideoDiffusionPipelineTester(TestCase): - """ - Tests the StableVideoDiffusionPipeline for Gaudi. 
- Adapted from: https://github.com/huggingface/diffusers/blob/v0.24.0-release/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py - """ - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNetSpatioTemporalConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=8, - out_channels=4, - down_block_types=( - "CrossAttnDownBlockSpatioTemporal", - "DownBlockSpatioTemporal", - ), - up_block_types=("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal"), - cross_attention_dim=32, - num_attention_heads=8, - projection_class_embeddings_input_dim=96, - addition_time_embed_dim=32, - ) - scheduler = GaudiEulerDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - interpolation_type="linear", - num_train_timesteps=1000, - prediction_type="v_prediction", - sigma_max=700.0, - sigma_min=0.002, - steps_offset=1, - timestep_spacing="leading", - timestep_type="continuous", - trained_betas=None, - use_karras_sigmas=True, - ) - - torch.manual_seed(0) - vae = AutoencoderKLTemporalDecoder( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - latent_channels=4, - ) - - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - intermediate_size=37, - patch_size=1, - ) - image_encoder = CLIPVisionModelWithProjection(config) - - torch.manual_seed(0) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - components = { - "unet": unet, - "image_encoder": image_encoder, - "scheduler": scheduler, - "vae": vae, - "feature_extractor": feature_extractor, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device="cpu").manual_seed(seed) - - image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device) - inputs = { - "generator": generator, - "image": image, - "num_inference_steps": 2, - "output_type": "pt", - "min_guidance_scale": 1.0, - "max_guidance_scale": 2.5, - "num_frames": 2, - "height": 32, - "width": 32, - } - return inputs - - def test_stable_video_diffusion_single_video(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - gaudi_config = GaudiConfig(use_torch_autocast=False) - sd_pipe = GaudiStableVideoDiffusionPipeline(use_habana=True, gaudi_config=gaudi_config, **components) - for component in sd_pipe.components.values(): - if hasattr(component, "set_default_attn_processor"): - component.set_default_attn_processor() - - sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - outputs = sd_pipe( - **self.get_dummy_inputs(device), - ).frames - image = outputs[0] - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(len(outputs), 1) - self.assertEqual(image.shape, (2, 3, 32, 32)) - - expected_slice = np.array([0.5910, 0.5797, 0.5521, 0.6628, 0.6212, 0.6422, 0.5681, 0.5232, 0.5343]) - - self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-2) - - @slow - def test_stable_video_diffusion_no_throughput_regression_bf16(self): - image_url = ( - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" - ) - model_name = "stabilityai/stable-video-diffusion-img2vid-xt" - scheduler = 
GaudiEulerDiscreteScheduler.from_pretrained(model_name, subfolder="scheduler") - - pipeline = GaudiStableVideoDiffusionPipeline.from_pretrained( - model_name, - scheduler=scheduler, - use_habana=True, - use_hpu_graphs=True, - gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"), - torch_dtype=torch.bfloat16, - ) - set_seed(42) - prompt_image = load_image(image_url) - outputs = pipeline( - image=prompt_image, - num_videos_per_prompt=1, - batch_size=1, - height=256, - width=256, - ) - - self.assertEqual(len(outputs.frames[0]), 25) - if IS_GAUDI2: - self.assertGreaterEqual(outputs.throughput, 0.95 * 0.012) diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py deleted file mode 100644 index 06d03ff92d..0000000000 --- a/tests/test_encoder_decoder.py +++ /dev/null @@ -1,248 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import List - -import pytest - -from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR - - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "summarization": { - "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 3.9, 28.9801, 2, 2), - ("t5-3b", "Habana/t5", 2.955, 21.8877, 2, 1), - ], - }, - "translation": { - "bf16": [ - ("Babelscape/mrebel-large", "Habana/t5", 1.323, 0.1618, 2, 1), - ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.815, 0.8132, 2, 1), - ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.401, 1.2599, 2, 1), - ("t5-small", "Habana/t5", 14.482, 11.7277, 2, 1), - ], - }, - } -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = { - "summarization": { - "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 2.304, 29.174, 2, 2), - ("t5-3b", "Habana/t5", 1.005, 21.7286, 2, 1), - ], - }, - "translation": { - "bf16": [ - ("Babelscape/mrebel-large", "Habana/t5", 0.995, 0.1784, 2, 1), - ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.409, 0.7995, 2, 1), - ("facebook/nllb-200-distilled-600M", "Habana/t5", 0.998, 1.2457, 2, 1), - ("t5-small", "Habana/t5", 9.188, 11.6126, 2, 1), - ], - }, - } - - -class TestEncoderDecoderModels: - PATH_TO_EXAMPLE_DIR = Path(__file__).resolve().parent.parent / "examples" - - def _install_requirements(self, task: str): - cmd_line = f"pip install -r {self.PATH_TO_EXAMPLE_DIR / task / 'requirements.txt'}".split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - assert return_code == 0 - - def _build_command( - self, - task: str, - deepspeed: bool = False, - world_size: int = 8, - command_args: List[str] = None, - ): - command = ["python3"] - - if deepspeed: - command += [ - f"{self.PATH_TO_EXAMPLE_DIR / 'gaudi_spawn.py'}", - "--use_deepspeed", - f"--world_size {world_size}", - ] - - if command_args is not None: - command += command_args - - if not deepspeed: - command.append("--bf16") - - return command - - def _run_test( - self, - command: List[str], - task: str, - baseline: float, - baseline_acc: float, - ): - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir}") - print(f"\n\nCommand to test: {' '.join(command)}\n") - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - command = [x for y in command for x in re.split(pattern, y) if x] - - proc = subprocess.run(command) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' 
'.join(command[:-2])}",)
-                raise
-
-            with open(Path(tmp_dir) / "predict_results.json") as fp:
-                results = json.load(fp)
-
-            # Ensure performance requirements (throughput) are met
-            assert results["predict_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline
-
-            if task == "summarization":
-                accuracy_metric = "predict_rougeLsum"
-            elif task == "translation":
-                accuracy_metric = "predict_bleu"
-            assert results[accuracy_metric] >= ACCURACY_PERF_FACTOR * baseline_acc
-
-    def _test_text_summarization(
-        self,
-        model_name: str,
-        gaudi_config: str,
-        baseline: float,
-        baseline_acc: float,
-        batch_size: int,
-        num_beams: int,
-        token: str,
-        deepspeed: bool = False,
-        world_size: int = 8,
-    ):
-        task = "summarization"
-
-        # Install summarization example requirements
-        self._install_requirements(task)
-
-        command_args = [
-            str(self.PATH_TO_EXAMPLE_DIR / task / f"run_{task}.py"),
-            f"--model_name_or_path {model_name}",
-            "--do_predict",
-            "--predict_with_generate",
-            "--dataset_name cnn_dailymail",
-            "--dataset_config 3.0.0",
-            "--use_habana",
-            f"--per_device_eval_batch_size {batch_size}",
-            f"--gaudi_config_name {gaudi_config}",
-            f"--num_beams {num_beams}",
-            "--ignore_pad_token_for_loss False",
-            "--pad_to_max_length",
-            "--use_hpu_graphs_for_inference",
-            "--use_lazy_mode",
-            "--max_predict_samples 200",
-        ]
-
-        command = self._build_command(
-            task=task,
-            deepspeed=deepspeed,
-            world_size=world_size,
-            command_args=command_args,
-        )
-
-        if not deepspeed and model_name == "t5-3b":
-            command.append("--bf16_full_eval")
-
-        self._run_test(command, task, baseline, baseline_acc)
-
-    def _test_text_translation(
-        self,
-        model_name: str,
-        gaudi_config: str,
-        baseline: float,
-        baseline_acc: float,
-        batch_size: int,
-        num_beams: int,
-        token: str,
-        deepspeed: bool = False,
-        world_size: int = 8,
-    ):
-        task = "translation"
-
-        # Install translation example requirements
-        self._install_requirements(task)
-
-        command_args = [
-            str(self.PATH_TO_EXAMPLE_DIR / task / f"run_{task}.py"),
-            f"--model_name_or_path {model_name}",
-            "--do_predict",
-            "--source_lang en",
-            "--target_lang ro",
-            '--source_prefix "translate English to Romanian: "',
-            "--dataset_name wmt16",
-            "--dataset_config_name ro-en",
-            f"--per_device_eval_batch_size {batch_size}",
-            f"--generation_num_beams {num_beams}",
-            "--predict_with_generate",
-            "--use_habana",
-            "--use_lazy_mode",
-            "--use_hpu_graphs_for_inference",
-            f"--gaudi_config_name {gaudi_config}",
-            "--ignore_pad_token_for_loss False",
-            "--pad_to_max_length",
-            "--max_predict_samples 200",
-        ]
-
-        if "opus-mt-zh-en" in model_name:
-            command_args.append("--max_source_length 512")
-
-        command = self._build_command(
-            task=task,
-            deepspeed=deepspeed,
-            world_size=world_size,
-            command_args=command_args,
-        )
-
-        self._run_test(command, task, baseline, baseline_acc)
-
-    @pytest.mark.parametrize(
-        "model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams",
-        MODELS_TO_TEST["summarization"]["bf16"],
-    )
-    def test_text_summarization_bf16(
-        self,
-        model_name: str,
-        gaudi_config: str,
-        baseline: float,
-        baseline_acc: float,
-        batch_size: int,
-        num_beams: int,
-        token: str,
-    ):
-        self._test_text_summarization(model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams, token)
-
-    @pytest.mark.parametrize(
-        "model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams",
-        MODELS_TO_TEST["translation"]["bf16"],
-    )
-    def test_text_translation_bf16(
-        self,
-        model_name: str,
-        gaudi_config: str,
-        baseline: float,
-
baseline_acc: float, - batch_size: int, - num_beams: int, - token: str, - ): - self._test_text_translation(model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams, token) diff --git a/tests/test_examples.py b/tests/test_examples.py deleted file mode 100755 index e1a3fa1dab..0000000000 --- a/tests/test_examples.py +++ /dev/null @@ -1,800 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import re -import subprocess -from distutils.util import strtobool -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Callable, Dict, List, Optional, Tuple, Union -from unittest import TestCase - -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_CTC_MAPPING, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, - MODEL_MAPPING, -) -from transformers.testing_utils import slow - -from .utils import ( - MODELS_TO_TEST_FOR_AUDIO_CLASSIFICATION, - MODELS_TO_TEST_FOR_CAUSAL_LANGUAGE_MODELING, - MODELS_TO_TEST_FOR_IMAGE_CLASSIFICATION, - MODELS_TO_TEST_FOR_IMAGE_TEXT, - MODELS_TO_TEST_FOR_MASKED_LANGUAGE_MODELING, - MODELS_TO_TEST_FOR_QUESTION_ANSWERING, - MODELS_TO_TEST_FOR_SEQ2SEQ, - MODELS_TO_TEST_FOR_SEQUENCE_CLASSIFICATION, - MODELS_TO_TEST_FOR_SPEECH_RECOGNITION, - MODELS_TO_TEST_MAPPING, -) - - -BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines") -# Models should reach at least 99% of their baseline accuracy -ACCURACY_PERF_FACTOR = 0.99 -# Trainings/Evaluations should last at most 5% longer than the baseline -TIME_PERF_FACTOR = 1.05 - - -IS_GAUDI2 = os.environ.get("GAUDI2_CI", "0") == "1" - - -def _get_supported_models_for_script( - models_to_test: Dict[str, List[Tuple[str]]], - task_mapping: Dict[str, str], - valid_models_for_task: List[str], -) -> List[Tuple[str]]: - """ - Filter models that can perform the task from models_to_test. - Args: - models_to_test: mapping between a model type and a tuple (model_name_or_path, gaudi_config_name). - task_mapping: mapping between a model config and a model class. - valid_models_for_task: list of models to test for a specific task. - Returns: - A list of models that are supported for the task. - Each element of the list follows the same format: (model_type, (model_name_or_path, gaudi_config_name)). 
- """ - - def is_valid_model_type(model_type: str) -> bool: - true_model_type = "llama" if model_type == "llama_guard" else model_type - if model_type == "protst": - in_task_mapping = True - else: - # llama_guard is not a model type in Transformers so CONFIG_MAPPING wouldn't find it - in_task_mapping = CONFIG_MAPPING[true_model_type] in task_mapping - in_valid_models_for_task = model_type in valid_models_for_task - if in_task_mapping and in_valid_models_for_task: - return True - return False - - return [ - model for model_type, models in models_to_test.items() for model in models if is_valid_model_type(model_type) - ] - - -_SCRIPT_TO_MODEL_MAPPING = { - "run_qa": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODELS_TO_TEST_FOR_QUESTION_ANSWERING, - ), - "run_glue": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODELS_TO_TEST_FOR_SEQUENCE_CLASSIFICATION, - ), - "run_clm": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODELS_TO_TEST_FOR_CAUSAL_LANGUAGE_MODELING, - ), - "run_summarization": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODELS_TO_TEST_FOR_SEQ2SEQ, - ), - "run_image_classification": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - MODELS_TO_TEST_FOR_IMAGE_CLASSIFICATION, - ), - "run_mlm": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODELS_TO_TEST_FOR_MASKED_LANGUAGE_MODELING, - ), - "run_audio_classification": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, - MODELS_TO_TEST_FOR_AUDIO_CLASSIFICATION, - ), - "run_speech_recognition_ctc": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CTC_MAPPING, - MODELS_TO_TEST_FOR_SPEECH_RECOGNITION, - ), - "run_seq2seq_qa": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODELS_TO_TEST_FOR_SEQ2SEQ, - ), - "run_clip": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_MAPPING, - MODELS_TO_TEST_FOR_IMAGE_TEXT, - ), - "run_bridgetower": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_MAPPING, - ["bridgetower"], - ), - "run_lora_clm": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - ["llama", "falcon"], - ), - "run_speech_recognition_seq2seq": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, - MODELS_TO_TEST_FOR_SPEECH_RECOGNITION, - ), - "sft": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - ["llama"], - ), - "dpo": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - ["llama"], - ), - "run_prompt_tuning_clm": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - ["llama"], - ), - "run_sequence_classification": _get_supported_models_for_script( - MODELS_TO_TEST_MAPPING, - MODEL_MAPPING, - ["protst"], - ), -} - - -class ExampleTestMeta(type): - """ - Metaclass that takes care of creating the proper example tests for a given task. - It uses example_name to figure out which models support this task, and create a run example test for each of these - models. 
- """ - - @staticmethod - def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool, fp8: bool): - models_with_specific_rules = [ - "albert-xxlarge-v1", - "gpt2-xl", - "facebook/wav2vec2-base", - "facebook/wav2vec2-large-lv60", - "BridgeTower/bridgetower-large-itm-mlm-itc", - "EleutherAI/gpt-neox-20b", - "google/flan-t5-xxl", - "tiiuae/falcon-40b", - "bigscience/bloom-7b1", - "codellama/CodeLlama-13b-Instruct-hf", - "MIT/ast-finetuned-speech-commands-v2", - "meta-llama/LlamaGuard-7b", - ] - - if fsdp and not IS_GAUDI2: - return False - elif ( - "sft" in example_name - or "dpo" in example_name - or "prompt_tuning" in example_name - or example_name == "run_sequence_classification" - ) and not IS_GAUDI2: - return False - elif model_name not in models_with_specific_rules and not deepspeed: - return True - elif model_name == "gpt2-xl" and deepspeed: - # GPT2-XL is tested only with DeepSpeed - return True - elif "gpt-neox" in model_name and IS_GAUDI2 and deepspeed: - # GPT-NeoX is tested only on Gaudi2 and with DeepSpeed - return True - elif "flan-t5" in model_name and IS_GAUDI2 and deepspeed: - # Flan-T5 is tested only on Gaudi2 and with DeepSpeed - return True - elif "CodeLlama" in model_name and IS_GAUDI2 and deepspeed: - # CodeLlama is tested only on Gaudi2 and with DeepSpeed - return True - elif model_name == "albert-xxlarge-v1": - if (("RUN_ALBERT_XXL_1X" in os.environ) and strtobool(os.environ["RUN_ALBERT_XXL_1X"])) or multi_card: - # ALBERT XXL 1X is tested only if the required flag is present because it takes long - return True - elif "wav2vec2-base" in model_name and example_name == "run_audio_classification": - return True - elif "wav2vec2-large" in model_name and example_name == "run_speech_recognition_ctc": - return True - elif "bridgetower" in model_name and IS_GAUDI2: - return True - elif "falcon" in model_name and IS_GAUDI2 and not fsdp and not fp8: - return True - elif "bloom" in model_name and deepspeed and not IS_GAUDI2: - return True - elif "LlamaGuard" in model_name and deepspeed and IS_GAUDI2: - return True - elif "ast-finetuned-speech-commands-v2" in model_name and IS_GAUDI2: - return True - - return False - - def __new__( - cls, - name, - bases, - attrs, - example_name=None, - multi_card=False, - deepspeed=False, - fsdp=False, - torch_compile=False, - fp8=False, - ): - distribution = "single_card" - if multi_card: - distribution = "multi_card" - elif deepspeed: - distribution = "deepspeed" - - if example_name is not None: - models_to_test = _SCRIPT_TO_MODEL_MAPPING.get(example_name) - if models_to_test is None: - if example_name in ["run_esmfold", "run_lora_clm", "run_zero_shot_eval"]: - attrs[f"test_{example_name}_{distribution}"] = cls._create_test(None, None, None, None, None) - attrs["EXAMPLE_NAME"] = example_name - return super().__new__(cls, name, bases, attrs) - else: - raise AttributeError( - f"Could not create class because no model was found for example {example_name}" - ) - - for model_name, gaudi_config_name in models_to_test: - if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp, fp8): - attrs[f"test_{example_name}_{model_name.split('/')[-1]}_{distribution}"] = cls._create_test( - model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile, fp8 - ) - attrs["EXAMPLE_NAME"] = example_name - return super().__new__(cls, name, bases, attrs) - - @classmethod - def _create_test( - cls, - model_name: str, - gaudi_config_name: str, - multi_card: bool = False, - deepspeed: bool = False, - fsdp: bool = 
False, - torch_compile: bool = False, - fp8: bool = False, - ) -> Callable[[], None]: - """ - Create a test function that runs an example for a specific (model_name, gaudi_config_name) pair. - Args: - model_name (str): the model_name_or_path. - gaudi_config_name (str): the gaudi config name. - multi_card (bool): whether it is a distributed run or not. - deepspeed (bool): whether deepspeed should be used or not. - Returns: - The test function that runs the example. - """ - - @slow - def test(self): - if self.EXAMPLE_NAME is None: - raise ValueError("An example name must be provided") - example_script = Path(self.EXAMPLE_DIR).glob(f"*/{self.EXAMPLE_NAME}.py") - example_script = list(example_script) - if len(example_script) == 0: - raise RuntimeError(f"Could not find {self.EXAMPLE_NAME}.py in examples located in {self.EXAMPLE_DIR}") - elif len(example_script) > 1: - raise RuntimeError(f"Found more than {self.EXAMPLE_NAME}.py in examples located in {self.EXAMPLE_DIR}") - else: - example_script = example_script[0] - - # The ESMFold example has no arguments, so we can execute it right away - if self.EXAMPLE_NAME == "run_esmfold": - p = subprocess.Popen(["python3", example_script]) - return_code = p.wait() - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - return - elif self.EXAMPLE_NAME == "run_zero_shot_eval": - with TemporaryDirectory() as tmp_dir: - cmd_line = f""" - python3 - {example_script} - --output_dir {tmp_dir} - --bf16 - --max_seq_length 1024 - """.split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - # Assess accuracy - with open(Path(tmp_dir) / "accuracy_metrics.json") as fp: - results = json.load(fp) - baseline = 0.43 if os.environ.get("GAUDI2_CI", "0") == "1" else 0.42 - self.assertGreaterEqual(results["accuracy"], baseline) - return - elif self.EXAMPLE_NAME == "run_clip": - if os.environ.get("DATA_CACHE", None) is None: - from .clip_coco_utils import COCO_URLS, download_files - - download_files(COCO_URLS) - from .clip_coco_utils import create_clip_roberta_model - - create_clip_roberta_model() - - self._install_requirements(example_script.parent / "requirements.txt") - - path_to_baseline = BASELINE_DIRECTORY / Path(model_name.split("/")[-1].replace("-", "_")).with_suffix( - ".json" - ) - with path_to_baseline.open("r") as json_file: - device = "gaudi2" if IS_GAUDI2 else "gaudi" - baseline = json.load(json_file)[device] - if isinstance(self.TASK_NAME, list): - for key in self.TASK_NAME: - if key in baseline: - baseline = baseline[key] - break - if "num_train_epochs" not in baseline: - raise ValueError( - f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}." 
- ) - self.TASK_NAME = key - else: - baseline = baseline[self.TASK_NAME] - - distribution = "single_card" - if multi_card: - distribution = "multi_card" - elif deepspeed: - distribution = "deepspeed" - - env_variables = os.environ.copy() - if "falcon" in model_name: - env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt") - elif "flan" in model_name: - env_variables["PT_HPU_MAX_COMPOUND_OP_SIZE"] = "512" - elif "bloom" in model_name: - env_variables["DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED"] = "1" - env_variables["PT_HPU_MAX_COMPOUND_OP_SYNC"] = "1" - env_variables["PT_HPU_MAX_COMPOUND_OP_SIZE"] = "1" - elif fsdp: - if "llama" in model_name: - env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt") - env_variables["PT_HPU_LAZY_MODE"] = "0" - elif deepspeed and "gpt-neox-20b" in model_name: - env_variables["LD_PRELOAD"] = "" - - if fp8 and "llama" in model_name: - env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt") - - extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", []) - - if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip": - extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"]) - elif torch_compile and ( - model_name == "bert-large-uncased-whole-word-masking" or model_name == "roberta-large" - ): - extra_command_line_arguments.append("--torch_compile_backend hpu_backend") - extra_command_line_arguments.append("--torch_compile") - if "--use_hpu_graphs_for_inference" in extra_command_line_arguments: - extra_command_line_arguments.remove("--use_hpu_graphs_for_inference") - env_variables["PT_HPU_LAZY_MODE"] = "0" - env_variables["PT_ENABLE_INT64_SUPPORT"] = "1" - - with TemporaryDirectory() as tmp_dir: - cmd_line = self._create_command_line( - multi_card, - deepspeed, - fsdp, - example_script, - model_name, - gaudi_config_name, - tmp_dir, - task=self.TASK_NAME, - lr=baseline.get("distribution").get(distribution).get("learning_rate"), - train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"), - eval_batch_size=baseline.get("eval_batch_size"), - num_epochs=baseline.get("num_train_epochs"), - extra_command_line_arguments=extra_command_line_arguments, - ) - - p = subprocess.Popen(cmd_line, env=env_variables) - return_code = p.wait() - - # Ensure the run finished without any issue - self.assertEqual(return_code, 0) - - with open(Path(tmp_dir) / "all_results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (accuracy, training time) are met - self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name) - - # TODO: is a cleanup of the dataset cache needed? - # self._cleanup_dataset_cache() - - return test - - -class ExampleTesterBase(TestCase): - """ - Base example tester class. - Attributes: - EXAMPLE_DIR (`str` or `os.Pathlike`): the directory containing the examples. - EXAMPLE_NAME (`str`): the name of the example script without the file extension, e.g. run_qa, run_glue, etc. - TASK_NAME (`str`): the name of the dataset to use. - DATASET_PARAMETER_NAME (`str`): the argument name to use for the dataset parameter. - Most of the time it will be "dataset_name", but for some tasks on a benchmark it might be something else. - MAX_SEQ_LENGTH ('str'): the max_seq_length argument for this dataset. - The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded. 
- """ - - EXAMPLE_DIR = Path(os.path.dirname(__file__)).parent / "examples" - EXAMPLE_NAME = None - TASK_NAME = None - DATASET_PARAMETER_NAME = "dataset_name" - DATASET_NAME = None - REGRESSION_METRICS = { - "eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR), - "eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR), - "eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR), - "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), - "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), - } - - def _create_command_line( - self, - multi_card: bool, - deepspeed: bool, - fsdp: bool, - script: Path, - model_name: str, - gaudi_config_name: str, - output_dir: str, - lr: float, - train_batch_size: int, - eval_batch_size: int, - num_epochs: int, - task: Optional[str] = None, - extra_command_line_arguments: Optional[List[str]] = None, - ) -> List[str]: - dataset_name = self.DATASET_NAME if self.DATASET_NAME is not None else task - task_option = f"--{self.DATASET_PARAMETER_NAME} {dataset_name}" if task else " " - - cmd_line = ["python3"] - if multi_card: - cmd_line.append(f"{script.parent.parent / 'gaudi_spawn.py'}") - cmd_line.append("--world_size 8") - cmd_line.append("--use_mpi") - elif deepspeed: - cmd_line = [ - "deepspeed", - "--num_nodes 1", - "--num_gpus 8", - "--no_local_rank", - ] - if self.EXAMPLE_NAME == "dpo": - cmd_line += [ - f"{script}", - f"--model_name_or_path {model_name}", - f"--tokenizer_name_or_path {model_name}", - f"--output_dir {output_dir}", - f"--per_device_train_batch_size {train_batch_size}", - f"--per_device_eval_batch_size {eval_batch_size}", - ] - else: - cmd_line += [ - f"{script}", - f"--model_name_or_path {model_name}", - f"--gaudi_config_name {gaudi_config_name}", - f"{task_option}", - "--do_train", - f"--output_dir {output_dir}", - "--overwrite_output_dir", - f"--learning_rate {lr}", - f"--per_device_train_batch_size {train_batch_size}", - f"--per_device_eval_batch_size {eval_batch_size}", - f" --num_train_epochs {num_epochs}", - "--use_habana", - "--throughput_warmup_steps 3", - "--save_strategy no", - ] - - if "compile" in task: - cmd_line += ["--use_lazy_mode False"] - elif self.EXAMPLE_NAME != "dpo": - cmd_line += ["--use_lazy_mode"] - - if "bloom" not in model_name and self.EXAMPLE_NAME != "dpo": - cmd_line.append("--do_eval") - - if extra_command_line_arguments is not None: - cmd_line += extra_command_line_arguments - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - return [x for y in cmd_line for x in re.split(pattern, y) if x] - - def _install_requirements(self, requirements_filename: Union[str, os.PathLike]): - """ - Installs the necessary requirements to run the example if the provided file exists, otherwise does nothing. - """ - - if not Path(requirements_filename).exists(): - return - - cmd_line = f"pip install -r {requirements_filename}".split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - self.assertEqual(return_code, 0) - - def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str): - """ - Assert whether all possible performance requirements are met. 
- Attributes: - results (Dict): results of the run to assess - baseline (Dict): baseline to assert whether or not there is regression - """ - # Gather all the metrics to assess - metrics_to_assess = [] - for metric_name in self.REGRESSION_METRICS.keys(): - if metric_name in baseline and metric_name in results: - metrics_to_assess.append(metric_name) - - # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM - min_number_metrics = 3 - if self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo"] or "bloom" in model_name: - min_number_metrics = 2 - - # Check that at least 3 metrics are assessed: - # training time + throughput + accuracy metric (F1, accuracy, perplexity,...) - self.assertGreaterEqual( - len(metrics_to_assess), - min_number_metrics, - ( - f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training" - f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:" - f" {baseline.keys()}" - ), - ) - - # Message to display if one test fails - # This enables to show all the results and baselines even if one test fails before others - failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n" - for metric_name in metrics_to_assess: - failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n" - - # Assess metrics - for metric_name in metrics_to_assess: - assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name] - assert_function( - self, - results[metric_name], - threshold_factor * baseline[metric_name], - msg=f"for metric {metric_name}. {failure_message}", - ) - - -class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"): - TASK_NAME = "mrpc" - DATASET_PARAMETER_NAME = "task_name" - - -class MultiCardTextClassificationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue", multi_card=True -): - TASK_NAME = "mrpc" - DATASET_PARAMETER_NAME = "task_name" - - -class DeepSpeedTextClassificationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue", deepspeed=True -): - TASK_NAME = "mrpc" - DATASET_PARAMETER_NAME = "task_name" - - -class QuestionAnsweringExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_qa", torch_compile=True -): - TASK_NAME = "squad" - - -class MultiCardQuestionAnsweringExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_qa", multi_card=True, torch_compile=True -): - TASK_NAME = "squad" - - -class CausalLanguageModelingExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clm"): - TASK_NAME = "wikitext" - - -class MultiCardCausalLanguageModelingExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clm", multi_card=True -): - TASK_NAME = "wikitext" - - -class DeepspeedCausalLanguageModelingExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clm", deepspeed=True -): - TASK_NAME = "wikitext" - - -class ImageClassificationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_image_classification" -): - TASK_NAME = "cifar10" - - -class MultiCardImageClassificationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_image_classification", multi_card=True -): - TASK_NAME = "cifar10" - - -class MultiCardMaskedLanguageModelingExampleTester( - 
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_mlm", multi_card=True -): - TASK_NAME = "wikitext" - - -class MultiCardAudioClassificationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_audio_classification", multi_card=True -): - TASK_NAME = "common_language" - - -class MultiCardSpeechRecognitionExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_ctc", multi_card=True -): - TASK_NAME = "regisss/librispeech_asr_for_optimum_habana_ci" - DATASET_NAME = os.environ.get("DATA_CACHE", None) - - -class MultiCardSummarizationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_summarization", multi_card=True -): - TASK_NAME = "cnn_dailymail" - - -class DeepspeedSummarizationExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_summarization", deepspeed=True -): - TASK_NAME = "cnn_dailymail" - - -class MultiCardSeq2SeqQuestionAnsweringExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_seq2seq_qa", multi_card=True -): - TASK_NAME = "squad_v2" - - -class MultiCardVisionLanguageExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clip", multi_card=True -): - TASK_NAME = "ydshieh/coco_dataset_script" - - -class ProteinFoldingExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_esmfold"): - pass - - -class ProteinFoldingExampleTester2(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_zero_shot_eval"): - pass - - -class MultiCardCausalLanguageModelingLORAExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True -): - TASK_NAME = ["tatsu-lab/alpaca", "timdettmers/openassistant-guanaco"] - - -class MultiCardBridgetowerExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_bridgetower", multi_card=True -): - TASK_NAME = "jmhessel/newyorker_caption_contest" - - -class MultiCardSeq2SeqSpeechRecognitionExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_seq2seq", multi_card=True -): - TASK_NAME = "mozilla-foundation/common_voice_11_0" - - -class MultiCardCausalLanguageModelingLORAFSDPCompileExampleTester( - ExampleTesterBase, - metaclass=ExampleTestMeta, - example_name="run_lora_clm", - multi_card=True, - fsdp=True, -): - TASK_NAME = "tatsu-lab/alpaca_fsdpcompile" - DATASET_NAME = "tatsu-lab/alpaca" - - -class MultiCardCausalLanguageModelingLoRAFP8ExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True, fp8=True -): - TASK_NAME = "tatsu-lab/alpaca_fp8" - DATASET_NAME = "tatsu-lab/alpaca" - - -class MultiCardSFTExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="sft", multi_card=True): - TASK_NAME = "trl-sft" - DATASET_NAME = "lvwerra/stack-exchange-paired" - - -class MultiCardDPOExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="dpo", multi_card=True): - TASK_NAME = "trl-dpo" - DATASET_NAME = "lvwerra/stack-exchange-paired" - - -class MultiCardProteinFoldingClassificationTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_sequence_classification", multi_card=True -): - TASK_NAME = "prost-sequence-classification" - DATASET_NAME = "mila-intel/ProtST-BinaryLocalization" - - -class MultiCardCausalLanguageModelingPromptTuningExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_prompt_tuning_clm", 
multi_card=True -): - TASK_NAME = ["prompt-tuning"] - DATASET_NAME = "ought/raft" - - -class MultiCardCausalLanguageModelingPrefixTuningExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_prompt_tuning_clm", multi_card=True -): - TASK_NAME = ["prefix-tuning"] - DATASET_NAME = "ought/raft" - - -class MultiCardCausalLanguageModelingPTuningExampleTester( - ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_prompt_tuning_clm", multi_card=True -): - TASK_NAME = ["p-tuning"] - DATASET_NAME = "ought/raft" diff --git a/tests/test_examples_match_transformers.py b/tests/test_examples_match_transformers.py deleted file mode 100644 index f883efbf29..0000000000 --- a/tests/test_examples_match_transformers.py +++ /dev/null @@ -1,78 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import shutil -from os import PathLike -from pathlib import Path -from typing import List, Tuple, Union - -import pytest -from git import Repo - -from .create_diff_file_for_example import DIFF_DIRECTORY, diff - - -TRANSFORMERS_REPO_URL = "https://github.com/huggingface/transformers.git" -TRANSFORMERS_REPO_PATH = Path("transformers") - - -def get_examples( - transformers_example_dir: Union[str, PathLike], - optimum_example_dir: Union[str, PathLike], - include_readmes: bool = False, -) -> List[Tuple[str]]: - """Retrieves the common example filenames between the transformers and the optimum-habana repos.""" - # TODO: validate for include README. 
- glob_pattern = "*/run_*.py" if not include_readmes else "*/(run_*|README).(py|md)" - - transformers_files = list(Path(transformers_example_dir).glob(glob_pattern)) - transformer_example_names = {p.name for p in transformers_files} - optimum_files = list(Path(optimum_example_dir).glob(glob_pattern)) - optimum_example_names = {p.name for p in optimum_files} - - transformer_files = sorted(p for p in transformers_files if p.name in optimum_example_names) - optimum_files = sorted(p for p in optimum_files if p.name in transformer_example_names) - - return list(zip(transformer_files, optimum_files)) - - -cloned_repo = Repo.clone_from(TRANSFORMERS_REPO_URL, TRANSFORMERS_REPO_PATH) -EXAMPLES = get_examples(TRANSFORMERS_REPO_PATH / "examples" / "pytorch", "examples") - - -@pytest.mark.parametrize("filename1,filename2", EXAMPLES, ids=lambda filename: str(filename.name)) -def test_diff_match(filename1: Path, filename2: Path): - reference_diff_filename = DIFF_DIRECTORY / f"{filename1.stem}.txt" - try: - with open(reference_diff_filename) as fp: - reference_diff = fp.read() - except FileNotFoundError: - raise FileNotFoundError( - f"Could not find the reference diff file for example {filename1.name}, you can create it manually or with" - " the command line tool located at: optimum-habana/tests/create_diff_file_for_example.py" - ) - - current_diff = diff(filename1, filename2) - assert reference_diff == current_diff - - -@pytest.fixture(scope="session", autouse=True) -def cleanup(request): - # A bit hacky: this fixture will be called twice: at the beginning of the session, and at the end. - # The goal is to cleanup the transformers repository at the end of the test session. - # To do that, we first do nothing (yield some random value), which is executed at the beginning of the session, and - # then remove the repo, which is executed at the end of the session. 
- yield True - shutil.rmtree(TRANSFORMERS_REPO_PATH) diff --git a/tests/test_fp8_examples.py b/tests/test_fp8_examples.py deleted file mode 100644 index 54f5a7a63b..0000000000 --- a/tests/test_fp8_examples.py +++ /dev/null @@ -1,138 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest - -from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR - - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "fp8": [ - ( - "mistralai/Mistral-7B-Instruct-v0.2", - "tatsu-lab/alpaca", - "", - 12.373, - 0.7538, - "language-modeling", - 8, - 8, - "run_lora_clm.py", - ), - ], - } -else: - # FP8 is not supported on Gaudi1 - MODELS_TO_TEST = {"fp8": []} - - -def _test_fp8_train( - model_name: str, - dataset_name: str, - gaudi_config: str, - baseline: float, - baseline_acc: float, - task: str, - batch_size_train: int, - batch_size_eval: int, - script: str, - token: str, - world_size: int = 8, -): - path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" - - # Install question-answering example requirements - cmd_line = f"pip install -r {path_to_example_dir / task / 'requirements.txt'}".split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - assert return_code == 0 - - command = ["python3"] - - command += [ - f"{path_to_example_dir / task / script}", - f"--model_name_or_path {model_name}", - f"--dataset_name {dataset_name}", - "--do_train", - "--do_eval", - f"--per_device_eval_batch_size {batch_size_eval}", - f"--per_device_train_batch_size {batch_size_train}", - "--use_habana", - "--use_lazy_mode", - "--fp8 True", - ] - - if model_name == "mistralai/Mistral-7B-Instruct-v0.2": - command += [ - "--num_train_epochs 3", - "--evaluation_strategy no", - "--save_strategy no", - "--learning_rate 4e-4", - "--warmup_ratio 0.03", - "--lr_scheduler_type constant", - "--max_grad_norm 0.3", - "--logging_steps 1", - "--throughput_warmup_steps 5", - "--lora_rank 8", - "--lora_target_modules v_proj q_proj", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--dataset_concatenation", - "--max_seq_length 512", - "--low_cpu_mem_usage True", - "--validation_split_percentage 4", - "--adam_epsilon 1e-08", - f"--token {token.value}", - ] - - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir}") - print(f"\n\nCommand to test: {' '.join(command)}\n") - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - command = [x for y in command for x in re.split(pattern, y) if x] - - proc = subprocess.run(command) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise - - with open(Path(tmp_dir) / "all_results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (throughput) are met - assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline - assert results["eval_accuracy"] >= ACCURACY_PERF_FACTOR * baseline_acc - - -@pytest.mark.parametrize( - "model_name, dataset_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script", - MODELS_TO_TEST["fp8"], -) -def test_fp8_train( - model_name: str, - dataset_name: str, - gaudi_config: str, - baseline: float, - baseline_acc: float, - task: str, - bs_train: int, - bs_eval: int, - script: str, - token: str, -): - 
_test_fp8_train( - model_name, dataset_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, token - ) diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py deleted file mode 100644 index b3af2a05cc..0000000000 --- a/tests/test_fsdp_examples.py +++ /dev/null @@ -1,175 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest - -from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR - - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ( - "bert-base-uncased", - "Habana/bert-base-uncased", - 3516.322, - 85.5503, - "question-answering", - 24, - 8, - "run_qa.py", - "full_shard", - ), - ( - "meta-llama/Llama-2-7b-hf", - "", - 85.016, - 0.9093, - "language-modeling", - 8, - 8, - "run_lora_clm.py", - "auto_wrap", - ), - ], - } -else: - # FSDP is not supported on Gaudi1 - MODELS_TO_TEST = {"bf16": []} - - -def _test_fsdp( - model_name: str, - gaudi_config: str, - baseline: float, - baseline_acc: float, - task: str, - batch_size_train: int, - batch_size_eval: int, - script: str, - policy: str, - token: str, - world_size: int = 8, -): - os.environ["PT_HPU_LAZY_MODE"] = "0" - path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" - - # Install question-answering example requirements - cmd_line = f"pip install -r {path_to_example_dir / task / 'requirements.txt'}".split() - p = subprocess.Popen(cmd_line) - return_code = p.wait() - assert return_code == 0 - - command = ["python3"] - - command += [ - f"{path_to_example_dir / 'gaudi_spawn.py'}", - "--use_mpi", - f"--world_size {world_size}", - f"{path_to_example_dir / task / script}", - f"--model_name_or_path {model_name}", - "--do_train", - f"--per_device_eval_batch_size {batch_size_eval}", - f"--per_device_train_batch_size {batch_size_train}", - f"--fsdp_config {path_to_example_dir / task / 'fsdp_config.json'}", - f"--fsdp '{policy}'", - "--torch_compile_backend hpu_backend", - "--torch_compile", - "--use_habana", - ] - - if model_name == "bert-base-uncased": - command += [ - "--dataset_name squad", - "--max_seq_length 384", - "--learning_rate 3e-05", - "--num_train_epochs 2.0", - "--logging_steps 20", - "--save_steps 5000", - "--seed 42", - "--doc_stride 128", - "--overwrite_output_dir", - f"--gaudi_config_name {gaudi_config}", - "--throughput_warmup_steps 100", - "--do_eval", - ] - else: - command += [ - "--dataset_name tatsu-lab/alpaca ", - "--bf16 True ", - "--gradient_accumulation_steps 2", - "--save_strategy 'no'", - "--evaluation_strategy 'no'", - "--learning_rate 0.0003", - "--warmup_ratio 0.03", - "--max_grad_norm 0.3", - "--lr_scheduler_type 'constant'", - "--logging_steps 1", - "--use_lazy_mode False", - "--pipelining_fwd_bwd False", - "--throughput_warmup_steps 3", - "--lora_rank 8", - "--lora_alpha 16", - "--lora_dropout 0.05", - "--lora_target_modules 'q_proj' 'v_proj'", - "--dataset_concatenation", - "--max_seq_length 512", - "--adam_epsilon 1e-08", - "--low_cpu_mem_usage True", - "--attn_softmax_bf16 True", - "--num_train_epochs 3", - "--use_flash_attention True", - "--flash_attention_causal_mask True", - f"--token {token.value}", - ] - - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir}") - print(f"\n\nCommand to test: {' '.join(command)}\n") - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - command = [x for y in command for x in re.split(pattern, y) if x] - - proc = subprocess.run(command) - - # 
Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise - - with open(Path(tmp_dir) / "all_results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (throughput) are met - assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline - if model_name == "bert-base-uncased": - assert results["eval_f1"] >= ACCURACY_PERF_FACTOR * baseline_acc - else: - assert results["train_loss"] <= baseline_acc - - -@pytest.mark.parametrize( - "model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy", MODELS_TO_TEST["bf16"] -) -def test_fsdp_bf16( - model_name: str, - gaudi_config: str, - baseline: float, - baseline_acc: float, - task: str, - bs_train: int, - bs_eval: int, - script: str, - policy: str, - token: str, -): - _test_fsdp(model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy, token) diff --git a/tests/test_gaudi_configuration.py b/tests/test_gaudi_configuration.py deleted file mode 100644 index 8dcb31c738..0000000000 --- a/tests/test_gaudi_configuration.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import filecmp -import tempfile -import unittest -from pathlib import Path - -from optimum.habana import GaudiConfig - - -BF16_OPS_REFERENCE_FILE = Path(__file__).parent.resolve() / Path("configs/bf16_ops.txt") -FP32_OPS_REFERENCE_FILE = Path(__file__).parent.resolve() / Path("configs/fp32_ops.txt") - - -class GaudiConfigTester(unittest.TestCase): - """ - Unit tests for Gaudi configuration class GaudiConfig. 
- """ - - def test_default_parameter_types(self): - gaudi_config = GaudiConfig() - - self.assertIsInstance(gaudi_config.use_fused_adam, bool) - self.assertIsInstance(gaudi_config.use_fused_clip_norm, bool) - self.assertIsInstance(gaudi_config.use_torch_autocast, bool) - - self.assertIsNone(gaudi_config.autocast_bf16_ops) - self.assertIsNone(gaudi_config.autocast_fp32_ops) - - def test_write_bf16_fp32_ops_to_text_files(self): - gaudi_config = GaudiConfig( - autocast_bf16_ops=[ - "add", - "addmm", - "bmm", - "div", - "dropout", - "gelu", - "iadd", - "linear", - "layer_norm", - "matmul", - "mm", - "rsub", - "softmax", - "truediv", - ], - autocast_fp32_ops=[ - "embedding", - "nll_loss", - "log_softmax", - ], - ) - - with tempfile.NamedTemporaryFile() as bf16_file: - with tempfile.NamedTemporaryFile() as fp32_file: - gaudi_config.write_bf16_fp32_ops_to_text_files( - bf16_file.name, - fp32_file.name, - ) - - self.assertTrue( - filecmp.cmp( - bf16_file.name, - BF16_OPS_REFERENCE_FILE, - shallow=False, - ) - ) - self.assertTrue( - filecmp.cmp( - fp32_file.name, - FP32_OPS_REFERENCE_FILE, - shallow=False, - ) - ) diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py deleted file mode 100644 index 85982ce7c7..0000000000 --- a/tests/test_image_to_text_example.py +++ /dev/null @@ -1,108 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest - -from .test_examples import TIME_PERF_FACTOR - - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ("llava-hf/llava-1.5-7b-hf", 1, 87.2901500056982), - ("llava-hf/llava-1.5-13b-hf", 1, 54.41252589197953), - ("llava-hf/llava-v1.6-mistral-7b-hf", 1, 33.17984878151546), - ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 23.527610042925), - ], - "fp8": [ - ("llava-hf/llava-1.5-7b-hf", 1, 123.00953973789325), - ("llava-hf/llava-1.5-13b-hf", 1, 82.81132373492122), - ], - } -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ("llava-hf/llava-1.5-7b-hf", 1, 28.04096918512148), - ("llava-hf/llava-1.5-13b-hf", 1, 16.704731010481538), - ("llava-hf/llava-v1.6-mistral-7b-hf", 1, 10.759228696741), - ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 6.96732060769783), - ], - "fp8": [], - } - - -def _test_image_to_text( - model_name: str, - baseline: float, - token: str, - batch_size: int = 1, - fp8: bool = False, -): - command = ["python3"] - path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" - env_variables = os.environ.copy() - - command += [ - f"{path_to_example_dir / 'image-to-text' / 'run_pipeline.py'}", - f"--model_name_or_path {model_name}", - f"--batch_size {batch_size}", - "--max_new_tokens 20", - ] - - command += [ - "--use_hpu_graphs", - ] - - command.append("--bf16") - - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir}") - print(f"\n\nCommand to test: {' '.join(command)}\n") - - command.append(f"--token {token.value}") - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - command = [x for y in command for x in re.split(pattern, y) if x] - - if fp8: - print(f"\n\nCommand to test: {' '.join(command)}\n") - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "image-to-text/quantization_config/maxabs_measure_include_outputs.json" - ) - subprocess.run(command, env=env_variables) - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "image-to-text/quantization_config/maxabs_quant.json" - ) - - proc = 
subprocess.run(command, env=env_variables) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise - - with open(Path(tmp_dir) / "results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (throughput) are met - assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline - - -@pytest.mark.parametrize("model_name, batch_size, baseline", MODELS_TO_TEST["bf16"]) -def test_image_to_text_bf16(model_name: str, baseline: float, batch_size: int, token: str): - _test_image_to_text(model_name, baseline, token, batch_size) - - -@pytest.mark.parametrize("model_name, batch_size, baseline", MODELS_TO_TEST["fp8"]) -def test_image_to_text_fp8(model_name: str, baseline: float, batch_size: int, token: str): - _test_image_to_text(model_name, baseline, token, batch_size, fp8=True) diff --git a/tests/test_object_segmentation.py b/tests/test_object_segmentation.py deleted file mode 100644 index ab044c4163..0000000000 --- a/tests/test_object_segmentation.py +++ /dev/null @@ -1,114 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
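The fp8 branch of _test_image_to_text above runs the same example command twice: a first pass with QUANT_CONFIG pointing at the measurement JSON to collect calibration statistics, then a second pass with the quantization JSON to generate in FP8. A minimal sketch of that two-phase driver, assuming only the config file names that appear in the test (run_fp8_pipeline and example_dir are illustrative names, not part of the test suite):

    import os
    import subprocess

    def run_fp8_pipeline(command, example_dir):
        # Hypothetical helper mirroring the two-phase FP8 flow used by the test above.
        env = os.environ.copy()
        # Phase 1: measurement pass, collects max-abs statistics for calibration.
        env["QUANT_CONFIG"] = os.path.join(
            example_dir, "image-to-text/quantization_config/maxabs_measure_include_outputs.json"
        )
        subprocess.run(command, env=env, check=True)
        # Phase 2: quantized pass, reuses the measurements to run inference in FP8.
        env["QUANT_CONFIG"] = os.path.join(
            example_dir, "image-to-text/quantization_config/maxabs_quant.json"
        )
        subprocess.run(command, env=env, check=True)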
- -import time -from unittest import TestCase - -import habana_frameworks.torch as ht -import numpy as np -import requests -import torch -from PIL import Image -from transformers import AutoModel, AutoProcessor - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -adapt_transformers_to_gaudi() - -# For Gaudi 2 -LATENCY_ClipSeg_BF16_GRAPH_BASELINE = 5.3107380867004395 - - -class GaudiClipSegTester(TestCase): - """ - Tests for ClipSeg model - """ - - def prepare_model_and_processor(self): - model = AutoModel.from_pretrained("CIDAS/clipseg-rd64-refined").to("hpu") - processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") - model = model.eval() - return model, processor - - def prepare_data(self): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - texts = ["a cat", "a remote", "a blanket"] - return texts, image - - def test_inference_default(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to("hpu") - outputs = model(**inputs) - probs = outputs.logits_per_image.softmax(dim=-1).detach().cpu().numpy()[0] - expected_scores = np.array([0.02889409, 0.87959206, 0.09151383]) # from CPU - self.assertEqual(len(probs), 3) - self.assertLess(np.abs(probs - expected_scores).max(), 0.01) - - def test_inference_autocast(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to("hpu") - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 - outputs = model(**inputs) - probs = outputs.logits_per_image.softmax(dim=-1).to(torch.float32).detach().cpu().numpy()[0] - expected_scores = np.array([0.02889409, 0.87959206, 0.09151383]) # from CPU - self.assertEqual(len(probs), 3) - self.assertEqual(probs.argmax(), expected_scores.argmax()) - - def test_inference_hpu_graphs(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to("hpu") - - model = ht.hpu.wrap_in_hpu_graph(model) # Apply graph - - outputs = model(**inputs) - probs = outputs.logits_per_image.softmax(dim=-1).to(torch.float32).detach().cpu().numpy()[0] - expected_scores = np.array([0.02889409, 0.87959206, 0.09151383]) # from CPU - self.assertEqual(len(probs), 3) - self.assertEqual(probs.argmax(), expected_scores.argmax()) - - def test_no_latency_regression_autocast(self): - warmup = 3 - iterations = 20 - - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - - model = ht.hpu.wrap_in_hpu_graph(model) - - with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - for i in range(warmup): - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to( - "hpu" - ) - _ = model(**inputs) - torch.hpu.synchronize() - - total_model_time = 0 - for i in range(iterations): - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to( - "hpu" - ) - model_start_time = time.time() - _ = model(**inputs) - torch.hpu.synchronize() - model_end_time = time.time() - total_model_time = total_model_time + (model_end_time - model_start_time) - - latency = 
total_model_time * 1000 / iterations # in terms of ms - self.assertLessEqual(latency, 1.05 * LATENCY_ClipSeg_BF16_GRAPH_BASELINE) diff --git a/tests/test_peft_inference.py b/tests/test_peft_inference.py deleted file mode 100644 index 205f9d4fd9..0000000000 --- a/tests/test_peft_inference.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import TestCase - -from peft import ( - PrefixTuningConfig, - PromptEncoderConfig, - PromptTuningConfig, - TaskType, - get_peft_model, -) -from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline - -from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -class GaudiPeftTester(TestCase): - def __init__(self, *args, **kwargs): - adapt_transformers_to_gaudi() - super().__init__(*args, **kwargs) - - def _text_generation(self, model, tokenizer, extra_kwargs=None): - generate_kwargs = { - "lazy_mode": True, - "hpu_graphs": True, - "max_new_tokens": 128, - "ignore_eos": True, - } - generator = pipeline( - "text-generation", - model=model, - tokenizer=tokenizer, - device="hpu", - ) - output = generator("Hello, Boy", **generate_kwargs) - return output[0]["generated_text"] - - def _test_text_generation(self, model_name_or_path, peft_method): - model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - if peft_method == "prompt-tuning": - config = PromptTuningConfig( - task_type=TaskType.CAUSAL_LM, - num_virtual_tokens=8, - ) - elif peft_method == "prefix-tuning": - config = PrefixTuningConfig( - task_type=TaskType.CAUSAL_LM, - num_virtual_tokens=8, - ) - elif peft_method == "p-tuning": - config = PromptEncoderConfig( - task_type=TaskType.CAUSAL_LM, - num_virtual_tokens=8, - ) - - result = self._text_generation(model, tokenizer) - model = get_peft_model(model, config) - model.__class__.generate = gaudi_generate - model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation - - result1 = self._text_generation(model, tokenizer) - self.assertNotEqual(result, result1) - - result2 = self._text_generation(model, tokenizer, extra_kwargs={"reuse_cache": True}) - self.assertEqual(result1, result2) - - result3 = self._text_generation(model, tokenizer, extra_kwargs={"bucket_size": 10}) - self.assertEqual(result1, result3) - - def test_text_generation_llama(self): - self._test_text_generation("huggyllama/llama-7b", "prompt-tuning") - self._test_text_generation("huggyllama/llama-7b", "p-tuning") - self._test_text_generation("huggyllama/llama-7b", "prefix-tuning") diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py deleted file mode 100644 index f94ebe6a4f..0000000000 --- a/tests/test_pipeline.py +++ /dev/null @@ -1,96 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import numpy as np -import pytest -import torch -from datasets import load_dataset -from habana_frameworks.torch.hpu import wrap_in_hpu_graph -from transformers import pipeline - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -MODELS_TO_TEST = { - "text-to-speech": [ - ("microsoft/speecht5_tts", 16000), - ("facebook/hf-seamless-m4t-medium", 16000), - ("facebook/mms-tts-eng", 16000), - ], - "image-to-text": [ - ("Salesforce/blip-image-captioning-base", "a soccer player is playing a game on the app"), - ("nlpconnect/vit-gpt2-image-captioning", "a soccer game with a player jumping to catch"), - ], -} - - -class TestGaudiPipeline: - @pytest.mark.parametrize("model, expected_result", MODELS_TO_TEST["image-to-text"]) - def test_image_to_text(self, model, expected_result): - adapt_transformers_to_gaudi() - MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32] - generate_kwargs = { - "lazy_mode": True, - "hpu_graphs": True, - "max_new_tokens": 128, - "ignore_eos": False, - } - image = os.path.dirname(__file__) + "/resource/img/image-captioning-example.png" - for model_dtype in MODEL_DTYPE_LIST: - generator = pipeline( - "image-to-text", - model=model, - torch_dtype=model_dtype, - device="hpu", - ) - generator.model = wrap_in_hpu_graph(generator.model) - for i in range(3): - output = generator(image, generate_kwargs=generate_kwargs) - assert output[0]["generated_text"].startswith(expected_result) - - @pytest.mark.parametrize("model, expected_sample_rate", MODELS_TO_TEST["text-to-speech"]) - def test_text_to_speech(self, model, expected_sample_rate): - adapt_transformers_to_gaudi() - MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32] - text = "hello, the dog is cooler" - for model_dtype in MODEL_DTYPE_LIST: - generator = pipeline( - "text-to-speech", - model=model, - torch_dtype=model_dtype, - device="hpu", - ) - forward_params = None - if generator.model.config.model_type == "speecht5": - embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") - speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to("hpu") - forward_params = {"speaker_embeddings": speaker_embedding} - if generator.model.config.model_type == "seamless_m4t": - forward_params = {"tgt_lang": "eng"} - - generate_kwargs = None - if generator.model.can_generate(): - generate_kwargs = {"lazy_mode": True, "ignore_eos": False, "hpu_graphs": True} - - generator.model = wrap_in_hpu_graph(generator.model) - with torch.autocast( - "hpu", torch.bfloat16, enabled=(model_dtype == torch.bfloat16) - ), torch.no_grad(), torch.inference_mode(): - for i in range(3): - output = generator(text, forward_params=forward_params, generate_kwargs=generate_kwargs) - assert isinstance(output["audio"], np.ndarray) - assert output["sampling_rate"] == expected_sample_rate diff --git a/tests/test_sentence_transformers.py b/tests/test_sentence_transformers.py deleted file mode 100644 index 90d97f3005..0000000000 
--- a/tests/test_sentence_transformers.py +++ /dev/null @@ -1,83 +0,0 @@ -import csv -import gzip -import os -import time - -import pytest -from sentence_transformers import SentenceTransformer, util - -from .test_examples import TIME_PERF_FACTOR - - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = [ - ("sentence-transformers/all-mpnet-base-v2", 762.5595168883357), - ("sentence-transformers/multi-qa-mpnet-base-dot-v1", 545.3360251829846), - ("sentence-transformers/all-distilroberta-v1", 958.5097903298335), - ("sentence-transformers/all-MiniLM-L12-v2", 3614.2610109716247), - ("sentence-transformers/multi-qa-distilbert-cos-v1", 944.6166139694299), - ("sentence-transformers/all-MiniLM-L6-v2", 2615.6975354038477), - ("sentence-transformers/multi-qa-MiniLM-L6-cos-v1", 1208.3672807492396), - ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 2392.1654748794062), - ("sentence-transformers/paraphrase-albert-small-v2", 3896.1911011860166), - ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 3558.0778715789693), - ("sentence-transformers/paraphrase-MiniLM-L3-v2", 5734.318427972881), - ("sentence-transformers/distiluse-base-multilingual-cased-v1", 3487.3319366004903), - ("sentence-transformers/distiluse-base-multilingual-cased-v2", 3807.2486282025716), - ] -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = [ - ("sentence-transformers/all-mpnet-base-v2", 164.36556936723508), - ("sentence-transformers/multi-qa-mpnet-base-dot-v1", 116.82789535569364), - ("sentence-transformers/all-distilroberta-v1", 226.90237421623164), - ("sentence-transformers/all-MiniLM-L12-v2", 1252.6261862281467), - ("sentence-transformers/multi-qa-distilbert-cos-v1", 216.47035182888888), - ("sentence-transformers/all-MiniLM-L6-v2", 1109.160132821451), - ("sentence-transformers/multi-qa-MiniLM-L6-cos-v1", 471.14320842607674), - ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 518.4762252952173), - ("sentence-transformers/paraphrase-albert-small-v2", 1139.806075824319), - ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 1253.06776127632), - ("sentence-transformers/paraphrase-MiniLM-L3-v2", 3029.398417051629), - ("sentence-transformers/distiluse-base-multilingual-cased-v1", 947.844857744754), - ("sentence-transformers/distiluse-base-multilingual-cased-v2", 947.7317550605878), - ] - - -def _test_sentence_transformers( - model_name: str, - baseline: float, -): - model = SentenceTransformer(model_name) - - nli_dataset_path = "/tmp/datasets/AllNLI.tsv.gz" - sentences = set() - max_sentences = 10000 - - # Download datasets if needed - if not os.path.exists(nli_dataset_path): - util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path) - - with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn: - reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) - for row in reader: - sentences.add(row["sentence1"]) - if len(sentences) >= max_sentences: - break - - sentences = list(sentences) - - for i in range(2): - start_time = time.perf_counter() - _ = model.encode(sentences, batch_size=32) - end_time = time.perf_counter() - diff_time = end_time - start_time - measured_throughput = len(sentences) / diff_time - # Only assert the last measured throughtput as the first iteration is used as a warmup - assert measured_throughput >= (2 - TIME_PERF_FACTOR) * baseline - - -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST) -def test_compute_embeddings_throughput(model_name: str, baseline: float): - 
_test_sentence_transformers(model_name, baseline) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py deleted file mode 100644 index 76b6736a93..0000000000 --- a/tests/test_text_generation_example.py +++ /dev/null @@ -1,359 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest - -from .test_examples import TIME_PERF_FACTOR - - -prev_quant_model_name = None -prev_quant_rank = 0 - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ("bigscience/bloomz-7b1", 1, False, 130.0472971205316), - ("gpt2-xl", 1, False, 281.8734689674413), - ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192), - ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354), - ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076), - ("tiiuae/falcon-40b", 1, True, 25.202450111088346), - ("bigcode/starcoder", 1, False, 65.58632640700114), - ("Salesforce/codegen2-1B", 1, False, 446.4029486883532), - ("mosaicml/mpt-30b", 1, False, 36.06464336116623), - ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782), - ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926), - ("microsoft/phi-2", 1, False, 224.72307766211117), - ("meta-llama/Meta-Llama-3-8B", 1, True, 129), - ("meta-llama/Llama-2-7b-hf", 512, True, 12808), - ("meta-llama/Llama-2-7b-hf", 512, False, 8711), # in some cases like TGI, reuse_cache isnt used - ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218), - ("codellama/CodeLlama-34b-hf", 1, True, 32.644), - ("bigcode/starcoder2-3b", 1, False, 234.2649120507936), - ("adept/persimmon-8b-base", 4, False, 366.73968820698406), - ("Qwen/Qwen1.5-7B", 4, False, 488.82855464593257), - ("google/gemma-7b", 1, False, 109.70751574382221), - ], - "fp8": [ - ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68), - ("meta-llama/Llama-2-7b-hf", 1, 1230, False, 128, 128, 13152.7), - ("meta-llama/Llama-2-7b-hf", 1, 163, False, 128, 2048, 4774.7), - ("meta-llama/Llama-2-7b-hf", 1, 94, False, 2048, 128, 1293.3), - ("meta-llama/Llama-2-7b-hf", 1, 81, False, 2048, 2048, 1942.9), - ("meta-llama/Llama-2-70b-hf", 4, 3042, False, 128, 128, 5374.6), - ("meta-llama/Llama-2-70b-hf", 4, 750, False, 128, 2048, 7422.4), - ("meta-llama/Llama-2-70b-hf", 4, 207, False, 2048, 128, 568.5), - ("meta-llama/Llama-2-70b-hf", 8, 172, False, 2048, 2048, 4656.2), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 896, True, 128, 128, 17068.965283763682), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 128, 2048, 6979.225194247115), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 2048, 128, 1681.4401450088983), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 44, True, 2048, 2048, 3393.149396451692), - ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 39.26845661768185), - ("microsoft/phi-2", 1, 1, True, 128, 128, 254.08932787178165), - ], - "gptq": [ - ("TheBloke/Llama-2-7b-Chat-GPTQ", 1, 10, False, 128, 2048, 456.7), - ], - "deepspeed": [ - ("bigscience/bloomz", 36.77314954096159), - ("meta-llama/Llama-2-70b-hf", 64.10514998902435), - ("meta-llama/Meta-Llama-3-70B-Instruct", 64), - ("facebook/opt-66b", 28.48069266504111), - ], - "torch_compile": [ - ("meta-llama/Llama-2-7b-hf", 102.27823420713148), - ], - "torch_compile_distributed": [ - ("meta-llama/Llama-2-7b-hf", 39.72973199515235), - ], - "distributed_tp": [ - ("meta-llama/Llama-2-7b-hf", 1345.2369318328463), - ], - } -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = { - "bf16": [ - 
("bigscience/bloomz-7b1", 1, False, 41.7555095197846), - ("gpt2-xl", 1, False, 142.11481820425706), - # TODO: fix OPT 6.7B - # ("facebook/opt-6.7b", 0.0), - ("EleutherAI/gpt-j-6b", 1, False, 50.79545107991805), - ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937), - ("tiiuae/falcon-7b", 1, True, 44.82870145718665), - ("bigcode/starcoder", 1, False, 15.945023767901013), - ("Salesforce/codegen2-1B", 1, False, 155.32071248826423), - ("mosaicml/mpt-7b", 1, False, 45.45168927038262), - ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711), - ("microsoft/phi-2", 1, False, 92.53083167241344), - ("google/gemma-7b", 1, False, 28.84284625836978), - ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992), - ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616), - ("adept/persimmon-8b-base", 1, False, 34.53559807384106), - ("bigcode/starcoder2-3b", 1, False, 82.09655684566117), - ], - "fp8": [], - "gptq": [], - "deepspeed": [ - ("bigscience/bloomz-7b1", 31.994268212011505), - ], - "torch_compile": [], - "torch_compile_distributed": [], - "distributed_tp": [], - } - - -def _test_text_generation( - model_name: str, - baseline: float, - token: str, - batch_size: int = 1, - reuse_cache: bool = False, - deepspeed: bool = False, - world_size: int = 8, - torch_compile: bool = False, - fp8: bool = False, - gptq: bool = False, - max_input_tokens: int = 0, - max_output_tokens: int = 100, - parallel_strategy: str = None, -): - command = ["python3"] - path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" - env_variables = os.environ.copy() - - if deepspeed: - command += [ - f"{path_to_example_dir / 'gaudi_spawn.py'}", - "--use_deepspeed", - f"--world_size {world_size}", - ] - elif parallel_strategy == "tp": - command += [ - f"{path_to_example_dir / 'gaudi_spawn.py'}", - f"--world_size {world_size}", - ] - - command += [ - f"{path_to_example_dir / 'text-generation' / 'run_generation.py'}", - f"--model_name_or_path {model_name}", - f"--batch_size {batch_size}", - "--use_kv_cache", - f"--max_new_tokens {max_output_tokens}", - ] - - if "llama" in model_name.lower(): - command += ["--trim_logits", "--attn_softmax_bf16"] - - if "falcon" in model_name.lower(): - command += ["--use_flash_attention", "--flash_attention_causal_mask"] - - if (reuse_cache or torch_compile) and not parallel_strategy == "tp": - command += ["--reuse_cache"] - - if torch_compile: - command += ["--torch_compile"] - if parallel_strategy == "tp": - command += ["--use_flash_attention"] - command += ["--flash_attention_recompute"] - env_variables["PT_ENABLE_INT64_SUPPORT"] = "1" - env_variables["PT_HPU_LAZY_MODE"] = "0" - else: - command += [ - "--use_hpu_graphs", - ] - - if not deepspeed: - command.append("--bf16") - - if fp8: - if "--trim_logits" not in command: - command += ["--trim_logits"] - if "Llama-2" in model_name: - command.insert(-2, "--use_flash_attention") - command.insert(-2, "--flash_attention_recompute") - command.insert(-2, "--bucket_size 128") - command.insert(-2, "--bucket_internal") - if "Mistral" in model_name: - command.insert(-2, "--use_flash_attention") - command.insert(-2, "--flash_attention_recompute") - command.insert(-2, "--attn_softmax_bf16") - command.insert(-2, "--trim_logits") - elif "falcon-180b" in model_name.lower(): - command.insert(-2, "--flash_attention_recompute") - - global prev_quant_model_name - global prev_quant_rank - measure_command = None - # FP8 Measurement only needed - if (prev_quant_model_name is None) or (prev_quant_model_name != model_name) or (prev_quant_rank != 
world_size): - measure_command = [ - x for x in command if not x.startswith("--max_new_tokens") - ] # Remove max_new_tokens for measurement - measure_command = [ - x if not x.startswith("--batch_size") else "--batch_size 1" for x in measure_command - ] # Remove batch_size for measurement - - prev_quant_model_name = model_name - prev_quant_rank = world_size - - # FP8 text generation - command += [ - f"--max_input_tokens {max_input_tokens}", - "--limit_hpu_graphs", - ] - - if gptq: - command += ["--gptq"] - - if parallel_strategy is not None: - command += [ - f"--parallel_strategy={parallel_strategy}", - ] - - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir}") - command.append(f"--token {token.value}") - - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - - if fp8: - env_variables["TQDM_DISABLE"] = "1" - if measure_command is not None: - measure_command.append(f"--token {token.value}") - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "text-generation/quantization_config/maxabs_measure_include_outputs.json" - ) - measure_command = [x for y in measure_command for x in re.split(pattern, y) if x] - print(f"\n\nMeasure Command to test: {' '.join(measure_command[:-2])}\n") - proc = subprocess.run(measure_command, env=env_variables) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(measure_command[:-2])}",) - raise - - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" - ) - - command = [x for y in command for x in re.split(pattern, y) if x] - print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") - proc = subprocess.run(command, env=env_variables) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise - - with open(Path(tmp_dir) / "results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (throughput) are met - assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline - - -@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline", MODELS_TO_TEST["bf16"]) -def test_text_generation_bf16(model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str): - _test_text_generation(model_name, baseline, token, batch_size, reuse_cache) - - -@pytest.mark.parametrize( - "model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline", MODELS_TO_TEST["fp8"] -) -def test_text_generation_fp8( - model_name: str, - baseline: float, - world_size: int, - batch_size: int, - reuse_cache: bool, - input_len: int, - output_len: int, - token: str, -): - deepspeed = True if world_size > 1 else False - _test_text_generation( - model_name, - baseline, - token, - deepspeed=deepspeed, - world_size=world_size, - fp8=True, - batch_size=batch_size, - reuse_cache=reuse_cache, - max_input_tokens=input_len, - max_output_tokens=output_len, - ) - - -@pytest.mark.parametrize( - "model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline", MODELS_TO_TEST["gptq"] -) -def test_text_generation_gptq( - model_name: str, - baseline: float, - world_size: int, - batch_size: int, 
- reuse_cache: bool, - input_len: int, - output_len: int, - token: str, -): - deepspeed = True if world_size > 1 else False - _test_text_generation( - model_name, - baseline, - token, - deepspeed=deepspeed, - world_size=world_size, - fp8=False, - gptq=True, - batch_size=batch_size, - reuse_cache=reuse_cache, - max_input_tokens=input_len, - max_output_tokens=output_len, - ) - - -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["deepspeed"]) -def test_text_generation_deepspeed(model_name: str, baseline: float, token: str): - world_size = 2 if "opt-66b" in model_name else 8 - _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size) - - -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["torch_compile"]) -def test_text_generation_torch_compile(model_name: str, baseline: float, token: str): - _test_text_generation(model_name, baseline, token, torch_compile=True) - - -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["torch_compile_distributed"]) -def test_text_generation_torch_compile_distributed(model_name: str, baseline: float, token: str): - world_size = 8 - _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, torch_compile=True) - -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["distributed_tp"]) -def test_text_generation_distributed_tp(model_name: str, baseline: float, token: str): - world_size = 8 - _test_text_generation( - model_name, - baseline, - token, - batch_size=64, - max_input_tokens=128, - world_size=world_size, - torch_compile=True, - parallel_strategy="tp", - ) diff --git a/tests/test_trainer.py b/tests/test_trainer.py deleted file mode 100644 index 8be28e79c9..0000000000 --- a/tests/test_trainer.py +++ /dev/null @@ -1,2784 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
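The throughput assertions in the example tests above all use the same tolerance rule: the measured value must reach (2 - TIME_PERF_FACTOR) * baseline. A minimal sketch of how that threshold behaves, assuming a TIME_PERF_FACTOR of 1.05 for illustration (the constant is defined in tests/test_examples.py and its exact value is not shown here):

    # Sketch only: TIME_PERF_FACTOR = 1.05 is an assumed value for illustration.
    TIME_PERF_FACTOR = 1.05

    def throughput_ok(measured: float, baseline: float) -> bool:
        # With TIME_PERF_FACTOR = 1.05, the factor is 2 - 1.05 = 0.95,
        # i.e. a run may be up to ~5% slower than the recorded baseline and still pass.
        return measured >= (2 - TIME_PERF_FACTOR) * baseline

    print(throughput_ok(97.0, 100.0))  # True: 3% below baseline is within tolerance
    print(throughput_ok(90.0, 100.0))  # False: 10% below baseline fails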
- -import dataclasses -import json -import math -import os -import random -import re -import subprocess -import tempfile -import unittest -from functools import partial -from itertools import product -from pathlib import Path -from typing import Dict, List, Optional, Union - -import numpy as np -from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files -from parameterized import parameterized -from pytest import mark -from requests.exceptions import HTTPError -from transformers import ( - IntervalStrategy, - PretrainedConfig, - TrainerCallback, - get_polynomial_decay_schedule_with_warmup, - is_torch_available, -) -from transformers.hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS -from transformers.testing_utils import ( - ENDPOINT_STAGING, - TOKEN, - USER, - CaptureLogger, - LoggingLevel, - TestCasePlus, - get_gpu_count, - get_tests_dir, - is_staging_test, - require_accelerate, - require_optuna, - require_safetensors, - require_sentencepiece, - require_tensorboard, - require_tokenizers, - require_torch, -) -from transformers.trainer_pt_utils import AcceleratorConfig -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend -from transformers.training_args import OptimizerNames -from transformers.utils import ( - SAFE_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, - WEIGHTS_NAME, - is_accelerate_available, - is_safetensors_available, -) -from transformers.utils.hp_naming import TrialShortNamer - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.utils import logging - - -if is_torch_available(): - import torch - import transformers.optimization - from torch import nn - from torch.utils.data import IterableDataset - from transformers import EarlyStoppingCallback, GPT2Config, PreTrainedModel, TrainerState - from transformers.modeling_utils import unwrap_model - - from optimum.habana import GaudiTrainer - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - from optimum.habana.transformers.models.gpt2 import GaudiGPT2LMHeadModel - - if is_safetensors_available(): - import safetensors.torch - - -# for version specific tests in TrainerIntegrationTest -require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") -GRAD_ACCUM_KWARGS_VERSION_AVAILABLE = is_accelerate_available("0.28") - - -PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt" - - -adapt_transformers_to_gaudi() - - -class RegressionDataset: - def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): - np.random.seed(seed) - self.label_names = ["labels"] if label_names is None else label_names - self.length = length - self.x = np.random.normal(size=(length,)).astype(np.float32) - self.ys = [a * self.x + b + np.random.normal(scale=0.1, size=(length,)) for _ in self.label_names] - self.ys = [y.astype(np.float32) for y in self.ys] - - def __len__(self): - return self.length - - def __getitem__(self, i): - result = {name: y[i] for name, y in zip(self.label_names, self.ys)} - result["input_x"] = self.x[i] - return result - - -class RegressionDatasetDynamic: - def __init__(self, a=2, b=3, length=128, seed=42, label_names=None): - np.random.seed(seed) - self.label_names = ["labels"] if label_names is None else label_names - self.length = length - self.a = a - self.b = b - - def __len__(self): - return self.length - - def __getitem__(self, i): - self.x = np.random.normal(size=(self.length + i,)).astype(np.float32) - self.ys = self.a * self.x + 
self.b + np.random.normal(scale=0.1, size=(self.length + i,)).astype(np.float32) - result = {} - result["labels"] = self.ys - result["input_x"] = self.x - return result - - -@dataclasses.dataclass -class RegressionGaudiTrainingArguments(GaudiTrainingArguments): - a: float = 0.0 - b: float = 0.0 - keep_report_to: bool = False - - def __post_init__(self): - super().__post_init__() - # save resources not dealing with reporting unless specified (also avoids the warning when it's not set) - # can be explicitly disabled via `keep_report_to` - if not self.keep_report_to: - self.report_to = [] - - -class RepeatDataset: - def __init__(self, x, length=64): - self.x = x - self.length = length - - def __len__(self): - return self.length - - def __getitem__(self, i): - return {"input_ids": self.x, "labels": self.x} - - -class DynamicShapesDataset: - def __init__(self, length=64, seed=42, batch_size=8): - self.length = length - np.random.seed(seed) - sizes = np.random.randint(1, 20, (length // batch_size,)) - # For easy batching, we make every batch_size consecutive samples the same size. - self.xs = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)] - self.ys = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)] - - def __len__(self): - return self.length - - def __getitem__(self, i): - return {"input_x": self.xs[i], "labels": self.ys[i]} - - -class AlmostAccuracy: - def __init__(self, thresh=0.25): - self.thresh = thresh - - def __call__(self, eval_pred): - predictions, labels = eval_pred - true = np.abs(predictions - labels) <= self.thresh - return {"accuracy": true.astype(np.float32).mean().item()} - - -class RegressionModelConfig(PretrainedConfig): - def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs): - super().__init__(**kwargs) - self.a = a - self.b = b - self.double_output = double_output - self.random_torch = random_torch - self.hidden_size = 1 - - -if is_torch_available(): - - class SampleIterableDataset(IterableDataset): - def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): - self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names) - - def __iter__(self): - for i in range(len(self.dataset)): - yield self.dataset[i] - - class FiniteIterableDataset(SampleIterableDataset): - def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): - super().__init__(a, b, length, seed, label_names) - self.current_sample = 0 - - def __iter__(self): - while self.current_sample < len(self.dataset): - yield self.dataset[self.current_sample] - self.current_sample += 1 - - class MultiLoader: - def __init__(self, loaders): - self.loaders = loaders - - def __len__(self): - return sum(len(loader) for loader in self.loaders) - - def __iter__(self): - for loader in self.loaders: - yield from loader - - class CustomDataloaderTrainer(GaudiTrainer): - def get_train_dataloader(self): - dataloaders = [super().get_train_dataloader(), super().get_train_dataloader()] - return MultiLoader(dataloaders) - - def get_eval_dataloader(self, eval_dataset): - dataloaders = [super().get_eval_dataloader(eval_dataset), super().get_eval_dataloader(eval_dataset)] - return MultiLoader(dataloaders) - - class RegressionModel(nn.Module): - def __init__(self, a=0, b=0, double_output=False): - super().__init__() - self.a = nn.Parameter(torch.tensor(a).float()) - self.b = nn.Parameter(torch.tensor(b).float()) - self.double_output = double_output - self.config = None - - def forward(self, input_x, 
labels=None, **kwargs): - y = input_x * self.a + self.b - if labels is None: - return (y, y) if self.double_output else (y,) - loss = nn.functional.mse_loss(y, labels) - return (loss, y, y) if self.double_output else (loss, y) - - class RegressionDictModel(nn.Module): - def __init__(self, a=0, b=0): - super().__init__() - self.a = nn.Parameter(torch.tensor(a).float()) - self.b = nn.Parameter(torch.tensor(b).float()) - self.config = None - - def forward(self, input_x, labels=None, **kwargs): - y = input_x * self.a + self.b - result = {"output": y} - if labels is not None: - result["loss"] = nn.functional.mse_loss(y, labels) - return result - - class RegressionPreTrainedModel(PreTrainedModel): - config_class = RegressionModelConfig - base_model_prefix = "regression" - - def __init__(self, config): - super().__init__(config) - self.a = nn.Parameter(torch.tensor(config.a).float(), requires_grad=True) - self.b = nn.Parameter(torch.tensor(config.b).float(), requires_grad=True) - self.double_output = config.double_output - - def forward(self, input_x, labels=None, **kwargs): - y = input_x * self.a + self.b - if labels is None: - return (y, y) if self.double_output else (y,) - loss = nn.functional.mse_loss(y, labels) - return (loss, y, y) if self.double_output else (loss, y) - - class RegressionPreTrainedModelWithGradientCheckpointing(PreTrainedModel): - config_class = RegressionModelConfig - base_model_prefix = "regression" - supports_gradient_checkpointing = True - - def __init__(self, config): - super().__init__(config) - self.layers = nn.ModuleList([nn.Linear(config.hidden_size, config.hidden_size) for _ in range(4)]) - self.head = nn.Linear(config.hidden_size, 1) - self.gradient_checkpointing = False - self.double_output = config.double_output - - def forward(self, input_x, labels=None, **kwargs): - y = input_x.unsqueeze(0) - - for layer in self.layers: - if self.training and self.gradient_checkpointing: - outputs = self._gradient_checkpointing_func(layer.__call__, y) - else: - outputs = layer(y) - - y = outputs * 3 - - logits = self.head(y) - - if labels is None: - return (logits, logits) if self.double_output else (logits,) - - loss = nn.functional.mse_loss(logits, labels) - - return (loss, y, y) if self.double_output else (loss, y) - - class RegressionRandomPreTrainedModel(PreTrainedModel): - config_class = RegressionModelConfig - base_model_prefix = "regression" - - def __init__(self, config): - super().__init__(config) - self.a = nn.Parameter(torch.tensor(config.a).float()) - self.b = nn.Parameter(torch.tensor(config.b).float()) - self.random_torch = config.random_torch - - def forward(self, input_x, labels=None, **kwargs): - y = input_x * self.a + self.b - if self.random_torch: - torch_rand = torch.randn(1).squeeze() - np_rand = np.random.rand() - rand_rand = random.random() - - if self.random_torch: - y += 0.05 * torch_rand - y += 0.05 * torch.tensor(np_rand + rand_rand) - - if labels is None: - return (y,) - loss = nn.functional.mse_loss(y, labels) - return (loss, y) - - class TstLayer(nn.Module): - def __init__(self, hidden_size): - super().__init__() - self.linear1 = nn.Linear(hidden_size, hidden_size) - self.ln1 = nn.LayerNorm(hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.ln2 = nn.LayerNorm(hidden_size) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - - def forward(self, x): - h = self.ln1(nn.functional.relu(self.linear1(x))) - h = nn.functional.relu(self.linear2(x)) - return self.ln2(x + h + self.bias) - - def 
get_gaudi_config(gaudi_config_name_or_path: Optional[Union[str, Path]] = None) -> GaudiConfig: - if gaudi_config_name_or_path is None: - gaudi_config_name_or_path = Path(__file__).parent.resolve() / Path( - "configs/gaudi_config_trainer_test.json" - ) - return GaudiConfig.from_pretrained(gaudi_config_name_or_path) - - def get_regression_trainer( - a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, keep_report_to=False, **kwargs - ): - label_names = kwargs.get("label_names", None) - gradient_checkpointing = kwargs.get("gradient_checkpointing", False) - train_dataset = RegressionDataset(length=train_len, label_names=label_names) - eval_dataset = RegressionDataset(length=eval_len, label_names=label_names) - - model_init = kwargs.pop("model_init", None) - if model_init is not None: - model = None - else: - if pretrained: - config = RegressionModelConfig(a=a, b=b, double_output=double_output) - # We infer the correct model class if one uses gradient_checkpointing or not - target_cls = ( - RegressionPreTrainedModel - if not gradient_checkpointing - else RegressionPreTrainedModelWithGradientCheckpointing - ) - model = target_cls(config) - else: - model = RegressionModel(a=a, b=b, double_output=double_output) - - gaudi_config = get_gaudi_config() - - compute_metrics = kwargs.pop("compute_metrics", None) - data_collator = kwargs.pop("data_collator", None) - optimizers = kwargs.pop("optimizers", (None, None)) - output_dir = kwargs.pop("output_dir", "./regression") - preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None) - - args = RegressionGaudiTrainingArguments( - output_dir, use_habana=True, use_lazy_mode=True, a=a, b=b, keep_report_to=keep_report_to, **kwargs - ) - - return GaudiTrainer( - model, - gaudi_config, - args, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - compute_metrics=compute_metrics, - optimizers=optimizers, - model_init=model_init, - preprocess_logits_for_metrics=preprocess_logits_for_metrics, - ) - - -class GaudiTrainerIntegrationCommon: - def check_saved_checkpoints(self, output_dir, freq, total, is_pretrained=True, safe_weights=True): - weights_file = WEIGHTS_NAME if not safe_weights else SAFE_WEIGHTS_NAME - file_list = [weights_file, "training_args.bin", "optimizer.pt", "scheduler.pt", "trainer_state.json"] - if is_pretrained: - file_list.append("config.json") - file_list.append("gaudi_config.json") - for step in range(freq, total, freq): - checkpoint = os.path.join(output_dir, f"checkpoint-{step}") - self.assertTrue(os.path.isdir(checkpoint)) - for filename in file_list: - self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename))) - - def check_best_model_has_been_loaded( - self, output_dir, freq, total, trainer, metric, greater_is_better=False, is_pretrained=True, safe_weights=True - ): - checkpoint = os.path.join(output_dir, f"checkpoint-{(total // freq) * freq}") - log_history = TrainerState.load_from_json(os.path.join(checkpoint, "trainer_state.json")).log_history - - values = [d[metric] for d in log_history] - best_value = max(values) if greater_is_better else min(values) - best_checkpoint = (values.index(best_value) + 1) * freq - checkpoint = os.path.join(output_dir, f"checkpoint-{best_checkpoint}") - if is_pretrained: - best_model = RegressionPreTrainedModel.from_pretrained(checkpoint) - best_model.to(trainer.args.device) - else: - best_model = RegressionModel() - if not safe_weights: - state_dict = torch.load(os.path.join(checkpoint, WEIGHTS_NAME)) - else: - 
state_dict = safetensors.torch.load_file(os.path.join(checkpoint, SAFE_WEIGHTS_NAME)) - best_model.load_state_dict(state_dict) - best_model.to(trainer.args.device) - self.assertTrue(torch.allclose(best_model.a, trainer.model.a)) - self.assertTrue(torch.allclose(best_model.b, trainer.model.b)) - - metrics = trainer.evaluate() - self.assertEqual(metrics[metric], best_value) - - def check_trainer_state_are_the_same(self, trainer_state, trainer_state1): - # We'll pop things so operate on copies. - state = trainer_state.copy() - state1 = trainer_state1.copy() - # Log history main contain different logs for the time metrics (after resuming a training). - log_history = state.pop("log_history", None) - log_history1 = state1.pop("log_history", None) - self.assertEqual(state, state1) - skip_log_keys = ["train_runtime", "train_samples_per_second", "train_steps_per_second", "train_loss"] - for log, log1 in zip(log_history, log_history1): - for key in skip_log_keys: - _ = log.pop(key, None) - _ = log1.pop(key, None) - self.assertEqual(log, log1) - - def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True): - # Converts a checkpoint of a regression model to a sharded checkpoint. - if load_safe: - loader = safetensors.torch.load_file - weights_file = os.path.join(folder, SAFE_WEIGHTS_NAME) - else: - loader = torch.load - weights_file = os.path.join(folder, WEIGHTS_NAME) - - if save_safe: - extension = "safetensors" - saver = safetensors.torch.save_file - index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) - shard_name = SAFE_WEIGHTS_NAME - else: - extension = "bin" - saver = torch.save - index_file = os.path.join(folder, WEIGHTS_INDEX_NAME) - shard_name = WEIGHTS_NAME - - state_dict = loader(weights_file) - - os.remove(weights_file) - keys = list(state_dict.keys()) - - shard_files = [ - shard_name.replace(f".{extension}", f"-{idx+1:05d}-of-{len(keys):05d}.{extension}") - for idx in range(len(keys)) - ] - index = {"metadata": {}, "weight_map": {key: shard_files[i] for i, key in enumerate(keys)}} - - with open(index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - - for param_name, shard_file in zip(keys, shard_files): - saver({param_name: state_dict[param_name]}, os.path.join(folder, shard_file)) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class GaudiTrainerIntegrationPrerunTest(TestCasePlus, GaudiTrainerIntegrationCommon): - """ - Only tests that want to tap into the auto-pre-run 2 trainings: - - self.default_trained_model - - self.alternate_trained_model - directly, or via check_trained_model - """ - - def setUp(self): - super().setUp() - args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.default_trained_model = (trainer.model.a, trainer.model.b) - - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.alternate_trained_model = (trainer.model.a, trainer.model.b) - - def check_trained_model(self, model, alternate_seed=False, bf16=False): - # Checks a training seeded with learning_rate = 0.1 - (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model - if not bf16: - self.assertTrue(torch.allclose(model.a, a)) - self.assertTrue(torch.allclose(model.b, b)) - else: - self.assertTrue(torch.allclose(model.a, a, atol=1e-03, rtol=0)) - 
self.assertTrue(torch.allclose(model.b, b, atol=1e-03, rtol=0)) - - def test_reproducible_training(self): - # Checks that training worked, the model trained, and the seed made training reproducible. - trainer = get_regression_trainer(learning_rate=0.1) - trainer.train() - self.check_trained_model(trainer.model) - - # Checks that a different seed gets different (reproducible) results. - trainer = get_regression_trainer(learning_rate=0.1, seed=314) - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) - - def test_trainer_with_datasets(self): - import datasets - - np.random.seed(42) - x = np.random.normal(size=(64,)).astype(np.float32) - y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)).astype(np.float32) - train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y}) - - gaudi_config = get_gaudi_config() - - # Base training. Should have the same results as test_reproducible_training - model = RegressionModel() - args = GaudiTrainingArguments("./regression", learning_rate=0.1, use_habana=True, use_lazy_mode=True) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) - - # Can return tensors. - train_dataset.set_format(type="torch", dtype=torch.float32) - model = RegressionModel() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) - - # Adding one column not used by the model should have no impact - z = np.random.normal(size=(64,)).astype(np.float32) - train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) - model = RegressionModel() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - self.check_trained_model(trainer.model) - - def test_model_init(self): - train_dataset = RegressionDataset() - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments("./regression", learning_rate=0.1, use_habana=True, use_lazy_mode=True) - trainer = GaudiTrainer( - gaudi_config=gaudi_config, args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel() - ) - trainer.train() - self.check_trained_model(trainer.model) - - # Re-training should restart from scratch, thus lead to the same results. - trainer.train() - self.check_trained_model(trainer.model) - - # Re-training should restart from scratch, thus lead to the same results, and the new seed should be used. - trainer.args.seed = 314 - trainer.train() - self.check_trained_model(trainer.model, alternate_seed=True) - - def test_gradient_accumulation(self): - # Training with half the batch size but accumulation steps of 2 should give the same results. - trainer = get_regression_trainer( - gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 - ) - trainer.train() - self.check_trained_model(trainer.model) - - # The test below is commented because it leads to a core dumped error - # when it is run with all other tests. It passes when run alone. - # It seems to be caused by setting `use_reentrant` to False in - # gradient checkpointing.
- # def test_gradient_checkpointing(self): - # trainer = get_regression_trainer( - # per_device_train_batch_size=1, - # learning_rate=0.1, - # gradient_checkpointing=True, - # gradient_checkpointing_kwargs={"use_reentrant": False}, - # ) - # previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()} - - # trainer.train() - - # # Check if model weights have been updated - # for k, v in trainer.model.named_parameters(): - # self.assertFalse( - # torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4), - # f"Model weights for {k} have not been updated", - # ) - - def test_training_loss(self): - n_gpus = max(1, get_gpu_count()) - - # With even logs - trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus)) - trainer.train() - log_history = trainer.state.log_history - - losses = [log["loss"] for log in log_history if "loss" in log] - train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4) - - # With uneven logs - trainer = get_regression_trainer(logging_steps=5) - trainer.train() - log_history = trainer.state.log_history - - # Training loss should be the same as before - new_train_loss = log_history[-1]["train_loss"] - self.assertAlmostEqual(train_loss, new_train_loss, places=4) - - def test_custom_optimizer(self): - train_dataset = RegressionDataset() - gaudi_config = get_gaudi_config() - gaudi_config.use_fused_adam = False - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True) - model = RegressionModel() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler) - ) - trainer.train() - - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) - - def test_lr_scheduler_kwargs(self): - # test scheduler kwargs passed via TrainingArguments - train_dataset = RegressionDataset() - model = RegressionModel() - num_steps, num_warmup_steps = 10, 2 - extra_kwargs = {"power": 5.0, "lr_end": 1e-5} # Non-default arguments - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="polynomial", - lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - use_habana=True, - use_lazy_mode=True, - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) - - # Checking that the correct args were passed - sched1 = trainer.lr_scheduler - sched2 = get_polynomial_decay_schedule_with_warmup( - trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs - ) - self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args) - self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords) - - def test_cosine_with_min_lr_scheduler(self): - train_dataset = RegressionDataset() - model = RegressionModel() - num_steps, num_warmup_steps = 10, 2 - extra_kwargs = {"min_lr": 1e-5} # Non-default arguments - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="cosine_with_min_lr", - 
lr_scheduler_kwargs=extra_kwargs, - learning_rate=0.2, - warmup_steps=num_warmup_steps, - use_habana=True, - use_lazy_mode=True, - ) - trainer = GaudiTrainer(model, gaudi_config=get_gaudi_config(), args=args, train_dataset=train_dataset) - trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) - - # Checking that the scheduler was created - self.assertIsNotNone(trainer.lr_scheduler) - - # Check the last learning rate - for _ in range(num_steps): - trainer.lr_scheduler.step() - self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) - - def test_reduce_lr_on_plateau_args(self): - # test passed arguments for a custom ReduceLROnPlateau scheduler - train_dataset = RegressionDataset(length=64) - eval_dataset = RegressionDataset(length=64) - gaudi_config = get_gaudi_config() - gaudi_config.use_fused_adam = False - args = GaudiTrainingArguments( - "./regression", - evaluation_strategy="epoch", - metric_for_best_model="eval_loss", - use_habana=True, - use_lazy_mode=True, - ) - model = RegressionModel() - optimizer = torch.optim.SGD(model.parameters(), lr=1.0) - lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - optimizers=(optimizer, lr_scheduler), - ) - trainer.train() - - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - self.assertEqual(trainer.lr_scheduler.factor, 0.2) - self.assertEqual(trainer.lr_scheduler.patience, 5) - self.assertEqual(trainer.lr_scheduler.cooldown, 2) - - def test_reduce_lr_on_plateau(self): - # test the ReduceLROnPlateau scheduler - - class TrainerWithLRLogs(GaudiTrainer): - def log(self, logs): - # the LR is computed after metrics and does not exist for the first epoch - if hasattr(self.lr_scheduler, "_last_lr"): - logs["learning_rate"] = self.lr_scheduler._last_lr[0] - super().log(logs) - - train_dataset = RegressionDataset(length=64) - eval_dataset = RegressionDataset(length=64) - gaudi_config = get_gaudi_config() - gaudi_config.use_fused_adam = False - - args = GaudiTrainingArguments( - "./regression", - lr_scheduler_type="reduce_lr_on_plateau", - evaluation_strategy="epoch", - metric_for_best_model="eval_loss", - num_train_epochs=10, - learning_rate=0.2, - use_habana=True, - use_lazy_mode=True, - ) - model = RegressionModel() - trainer = TrainerWithLRLogs(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - - self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) - patience = trainer.lr_scheduler.patience - - logs = trainer.state.log_history[1:] - best_loss = logs[0]["eval_loss"] - bad_epochs = 0 - for i, log in enumerate(logs[:-1]): # Compare learning rate to next epoch's - loss = log["eval_loss"] - just_decreased = False - if loss > best_loss: - bad_epochs += 1 - if bad_epochs > patience: - self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"]) - just_decreased = True - bad_epochs = 0 - else: - best_loss = loss - bad_epochs = 0 - if not just_decreased: - self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"]) - - def test_adafactor_lr_none(self): - # test the special case where lr=None, since Trainer can't not have lr_scheduler - - from transformers.optimization import Adafactor, AdafactorSchedule - - train_dataset = RegressionDataset() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True) - 
gaudi_config = get_gaudi_config() - gaudi_config.use_fused_adam = False - model = RegressionModel().to("hpu") - optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) - lr_scheduler = AdafactorSchedule(optimizer) - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler) - ) - trainer.train() - - (a, b) = self.default_trained_model - self.assertFalse(torch.allclose(trainer.model.a, a)) - self.assertFalse(torch.allclose(trainer.model.b, b)) - self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) - - def test_mixed_bf16(self): - # very basic test - trainer = get_regression_trainer(learning_rate=0.1, bf16=True) - self.assertTrue(trainer.use_hpu_amp) - trainer.train() - self.check_trained_model(trainer.model, bf16=True) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class GaudiTrainerIntegrationTest(TestCasePlus, GaudiTrainerIntegrationCommon): - def setUp(self): - super().setUp() - args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - - @mark.skip("Skip this test until PT_HPU_LAZY_MODE=0 is set as default for all tests") - def test_eager_mode(self): - train_dataset = RegressionDataset() - eval_dataset = RegressionDataset() - model = RegressionModel() - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=False) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) - - def test_hpu_graphs(self): - train_dataset = RegressionDataset() - eval_dataset = RegressionDataset() - model = RegressionModel() - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments( - "./regression", - use_habana=True, - use_lazy_mode=True, - use_hpu_graphs_for_training=True, - use_hpu_graphs_for_inference=True, - disable_tensor_cache_hpu_graphs=True, - max_hpu_graphs=1, - ) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) - - def test_trainer_works_with_dict(self): - train_dataset = RegressionDataset() - eval_dataset = RegressionDataset() - model = RegressionDictModel() - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train() - _ = trainer.evaluate() - _ = trainer.predict(eval_dataset) - - def test_evaluation_with_keys_to_drop(self): - config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - tiny_gpt2 = GaudiGPT2LMHeadModel(config) - x = torch.randint(0, 100, (128,)) - eval_dataset = RepeatDataset(x) - args = GaudiTrainingArguments("./test", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, eval_dataset=eval_dataset) - # By default the past_key_values are removed - result = trainer.predict(eval_dataset) - self.assertTrue(isinstance(result.predictions, np.ndarray)) - # We can still get them by setting ignore_keys to [] - result = trainer.predict(eval_dataset, ignore_keys=[]) - self.assertTrue(isinstance(result.predictions, tuple)) - 
self.assertEqual(len(result.predictions), 2) - - def test_training_arguments_are_left_untouched(self): - trainer = get_regression_trainer() - trainer.train() - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, report_to=[]) - dict1, dict2 = args.to_dict(), trainer.args.to_dict() - for key in dict1.keys(): - # Logging dir can be slightly different as they default to something with the time. - if key != "logging_dir": - self.assertEqual(dict1[key], dict2[key]) - - def test_number_of_steps_in_training(self): - # Regular training has n_epochs * len(train_dl) steps - trainer = get_regression_trainer(learning_rate=0.1) - train_output = trainer.train() - self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) - - # Check passing num_train_epochs works (and a float version too): - trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) - - # If we pass a max_steps, num_train_epochs is ignored - trainer = get_regression_trainer(learning_rate=0.1, max_steps=10) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 10) - - # TODO: investigate why this test fails - # def test_neftune(self): - # config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - # tiny_gpt2 = GPT2LMHeadModel(config) - # x = torch.randint(0, 100, (128,)) - # train_dataset = RepeatDataset(x) - - # # Trainer without inf/nan filter - # args = GaudiTrainingArguments( - # "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4, use_habana=True, use_lazy_mode=True, - # ) - # gaudi_config = get_gaudi_config() - # trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - - # trainer.model = trainer._activate_neftune(trainer.model) - - # dummy_input = torch.LongTensor([[1, 0, 1]]).to("hpu") - - # emb1 = trainer.model.get_input_embeddings()(dummy_input) - # emb2 = trainer.model.get_input_embeddings()(dummy_input) - - # self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!") - - # # redefine the model - # tiny_gpt2 = GPT2LMHeadModel(config) - # # Trainer without inf/nan filter - # args = GaudiTrainingArguments( - # "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4, use_habana=True, use_lazy_mode=True, - # ) - # trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - - # # Check that it trains without errors - # trainer.train() - - # # Make sure forward pass works fine - # _ = trainer.model(dummy_input) - # self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0) - - # trainer.model.eval() - - # # Check that we get identical embeddings just in case - # emb1 = trainer.model.get_input_embeddings()(dummy_input) - # emb2 = trainer.model.get_input_embeddings()(dummy_input) - - # self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!") - - def test_logging_inf_nan_filter(self): - config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) - tiny_gpt2 = GaudiGPT2LMHeadModel(config) - x = torch.randint(0, 100, (128,)) - train_dataset = RepeatDataset(x) - - # GaudiTrainer without inf/nan filter - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments( - "./test", - learning_rate=1e9, - logging_steps=5, - logging_nan_inf_filter=False, - use_habana=True, - 
use_lazy_mode=True, - ) - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - log_history_no_filter = trainer.state.log_history - - # GaudiTrainer with inf/nan filter - args = GaudiTrainingArguments( - "./test", - learning_rate=1e9, - logging_steps=5, - logging_nan_inf_filter=True, - use_habana=True, - use_lazy_mode=True, - ) - trainer = GaudiTrainer(tiny_gpt2, gaudi_config, args, train_dataset=train_dataset) - trainer.train() - log_history_filter = trainer.state.log_history - - def is_any_loss_nan_or_inf(log_history): - losses = [l["loss"] for l in log_history[:-1]] - return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) - - self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) - self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) - - def test_train_and_eval_dataloaders(self): - trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16) - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) - trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - - # Check drop_last works - trainer = get_regression_trainer( - train_len=66, eval_len=74, learning_rate=0.1, per_device_train_batch_size=16, per_device_eval_batch_size=32 - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16) + 1) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32) + 1) - - trainer = get_regression_trainer( - train_len=66, - eval_len=74, - learning_rate=0.1, - per_device_train_batch_size=16, - per_device_eval_batch_size=32, - dataloader_drop_last=True, - ) - self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16)) - self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32)) - - # Check passing a new dataset for evaluation works - new_eval_dataset = RegressionDataset(length=128) - self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32)) - - # tests that we do not require dataloader to have a .dataset attribute - def test_dataloader_without_dataset(self): - train_dataset = RegressionDataset(length=128) - args = GaudiTrainingArguments(output_dir="tmp_trainer", use_habana=True, use_lazy_mode=True) - trainer = CustomDataloaderTrainer( - model=RegressionModel(), - gaudi_config=get_gaudi_config(), - args=args, - train_dataset=train_dataset, - eval_dataset=train_dataset, - ) - trainer.train() - trainer.evaluate() - - def test_data_is_not_parallelized_when_model_is_parallel(self): - model = RegressionModel() - # Make the Trainer believe it's a parallelized model - model.is_parallelizable = True - model.model_parallel = True - args = GaudiTrainingArguments( - "./regression", - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - use_habana=True, - use_lazy_mode=True, - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset() - ) - # Check the Trainer was fooled - self.assertTrue(trainer.is_model_parallel) - self.assertEqual(trainer.args.n_gpu, 1) - - # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu - self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16) - self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) - self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16) - self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) - - def 
test_evaluate(self): - trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy()) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With logits preprocess - trainer = get_regression_trainer( - a=1.5, - b=2.5, - compute_metrics=AlmostAccuracy(), - preprocess_logits_for_metrics=lambda logits, labels: logits + 1, - ) - results = trainer.evaluate() - - x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - def test_predict(self): - trainer = get_regression_trainer(a=1.5, b=2.5) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With a number of elements not a round multiple of the batch size - trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With more than one output of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True) - preds = trainer.predict(trainer.eval_dataset).predictions - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - - # With more than one output/label of the model - trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"]) - outputs = trainer.predict(trainer.eval_dataset) - preds = outputs.predictions - labels = outputs.label_ids - x = trainer.eval_dataset.x - self.assertEqual(len(preds), 2) - self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) - self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) - self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) - self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) - - def test_dynamic_shapes(self): - eval_dataset = DynamicShapesDataset(batch_size=self.batch_size) - model = RegressionModel(a=2, b=1) - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - gaudi_config.use_dynamic_shapes = True - trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) - - # Check evaluation can run to completion - _ = trainer.evaluate() - - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.allclose(expected, seen[: 
expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - - for expected, seen in zip(eval_dataset.xs, preds.predictions): - self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - - # Same tests with eval accumulation - args = GaudiTrainingArguments("./regression", use_habana=True, use_lazy_mode=True, eval_accumulation_steps=2) - trainer = GaudiTrainer(model, gaudi_config, args, eval_dataset=eval_dataset) - - # Check evaluation can run to completion - _ = trainer.evaluate() - - # Check predictions - preds = trainer.predict(eval_dataset) - for expected, seen in zip(eval_dataset.ys, preds.label_ids): - self.assertTrue(np.allclose(expected, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - - for expected, seen in zip(eval_dataset.xs, preds.predictions): - self.assertTrue(np.allclose(2 * expected + 1, seen[: expected.shape[0]])) - self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) - - def test_dynamic_shape_feature(self): - # Run training with variable length inputs and enable dynamic shapes support - train_dataset = RegressionDatasetDynamic(length=256) - gaudi_config = get_gaudi_config() - gaudi_config.use_dynamic_shapes = True - args = GaudiTrainingArguments( - "./regression", use_habana=True, use_lazy_mode=True, per_device_train_batch_size=1, num_train_epochs=1 - ) - model = RegressionModel() - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - ) - train_output_ds = trainer.train() - - # Run training again with variable length inputs and disable dynamic shapes support - train_dataset = RegressionDatasetDynamic(length=256) - gaudi_config = get_gaudi_config() - gaudi_config.use_dynamic_shapes = False - args = GaudiTrainingArguments( - "./regression", use_habana=True, use_lazy_mode=True, per_device_train_batch_size=1, num_train_epochs=1 - ) - model = RegressionModel() - trainer = GaudiTrainer( - model, - gaudi_config, - args, - train_dataset=train_dataset, - ) - train_output_static = trainer.train() - - # Check if performance with dynamic shapes support is at least 5 times that without dynamic shapes - # Note "5x" number is not applicable across models, it is tuned for this particular dummy model - self.assertGreaterEqual( - train_output_ds.metrics["train_samples_per_second"], - 5 * train_output_static.metrics["train_samples_per_second"], - ) - - def test_log_level(self): - # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere) - logger = logging.get_logger() - log_info_string = "Running training" - - # test with the default log_level - should be the same as before and thus we test depending on is_info - is_info = logging.get_verbosity() <= 20 - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer() - trainer.train() - if is_info: - self.assertIn(log_info_string, cl.out) - else: - self.assertNotIn(log_info_string, cl.out) - - with LoggingLevel(logging.INFO): - # test with low log_level - lower than info - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="debug") - trainer.train() - self.assertIn(log_info_string, cl.out) - - with LoggingLevel(logging.INFO): - # test with high log_level - should be quiet - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="error") - trainer.train() - self.assertNotIn(log_info_string, cl.out) - - def test_save_checkpoints(self): - with 
tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size)) - - # With a regular model that is not a PreTrainedModel - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False) - - @require_safetensors - def test_safe_checkpoints(self): - for save_safetensors in [True, False]: - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors) - trainer.train() - self.check_saved_checkpoints( - tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors - ) - - # With a regular model that is not a PreTrainedModel - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors - ) - trainer.train() - self.check_saved_checkpoints( - tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors - ) - - def test_can_resume_training(self): - with tempfile.TemporaryDirectory() as tmpdir: - kwargs = { - "output_dir": tmpdir, - "train_len": 128, - "save_steps": 5, - "learning_rate": 0.1, - "logging_steps": 5, - } - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(tmpdir, "checkpoint-5") - - # Reinitialize trainer - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # Now check with a later checkpoint that it also works when we span over one epoch - checkpoint = os.path.join(tmpdir, "checkpoint-15") - - # Reinitialize trainer and load model - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # With a regular model that is not a PreTrainedModel - with tempfile.TemporaryDirectory() as tmpdir: - kwargs = { - "output_dir": tmpdir, - "train_len": 128, - "save_steps": 5, - "learning_rate": 0.1, - "pretrained": False, - } - - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(tmpdir, "checkpoint-5") - - # Reinitialize trainer and load model - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it 
makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # Now check with a later checkpoint that it also works when we span over one epoch - checkpoint = os.path.join(tmpdir, "checkpoint-15") - - # Reinitialize trainer and load model - trainer = get_regression_trainer(**kwargs) - # Disable FusedClipNorm because it makes the test fail - trainer.gaudi_config.use_fused_clip_norm = False - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - # Now check failures - - # 1. fail to find a bogus checkpoint - trainer = get_regression_trainer() - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") - self.assertTrue("Can't find a valid checkpoint at" in str(context.exception)) - - # 2. fail to find any checkpoint - due a fresh output_dir - output_dir2 = self.get_auto_remove_tmp_dir() - trainer = get_regression_trainer(output_dir=output_dir2) - with self.assertRaises(Exception) as context: - trainer.train(resume_from_checkpoint=True) - self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) - - def test_resume_training_with_randomness(self): - train_dataset = RegressionDataset(length=128) - eval_dataset = RegressionDataset() - - config = RegressionModelConfig(a=0, b=2) - model = RegressionRandomPreTrainedModel(config) - - tmp_dir = self.get_auto_remove_tmp_dir() - args = RegressionGaudiTrainingArguments( - tmp_dir, save_steps=5, learning_rate=0.1, use_habana=True, use_lazy_mode=True - ) - gaudi_config = get_gaudi_config() - # Disable FusedClipNorm because it makes the test fail - gaudi_config.use_fused_clip_norm = False - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - - model = RegressionRandomPreTrainedModel(config) - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15")) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - - self.assertAlmostEqual(a, a1, delta=1e-5) - self.assertAlmostEqual(b, b1, delta=1e-5) - - def test_auto_batch_size_with_resume_from_checkpoint(self): - train_dataset = RegressionDataset(length=128) - - config = RegressionModelConfig(a=0, b=2) - model = RegressionRandomPreTrainedModel(config) - - tmp_dir = self.get_auto_remove_tmp_dir() - - class MockCudaOOMCallback(TrainerCallback): - def on_step_end(self, args, state, control, **kwargs): - # simulate OOM on the first step - if state.train_batch_size >= 16: - raise RuntimeError("CUDA out of memory.") - - args = RegressionGaudiTrainingArguments( - tmp_dir, - do_train=True, - max_steps=2, - save_steps=1, - per_device_train_batch_size=16, - auto_find_batch_size=True, - use_habana=True, - use_lazy_mode=True, - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model, gaudi_config, args, train_dataset=train_dataset, callbacks=[MockCudaOOMCallback()] - ) - 
trainer.train() - # After `auto_find_batch_size` is ran we should now be at 8 - self.assertEqual(trainer._train_batch_size, 8) - - # We can then make a new Trainer - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset) - # Check we are at 16 to start - self.assertEqual(trainer._train_batch_size, 16 * max(trainer.args.n_gpu, 1)) - trainer.train(resume_from_checkpoint=True) - # We should be back to 8 again, picking up based upon the last ran Trainer - self.assertEqual(trainer._train_batch_size, 8) - - # regression for this issue: https://github.com/huggingface/transformers/issues/12970 - def test_training_with_resume_from_checkpoint_false(self): - train_dataset = RegressionDataset(length=128) - eval_dataset = RegressionDataset() - - config = RegressionModelConfig(a=0, b=2) - model = RegressionRandomPreTrainedModel(config) - - tmp_dir = self.get_auto_remove_tmp_dir() - args = RegressionGaudiTrainingArguments( - tmp_dir, save_steps=5, learning_rate=0.1, use_habana=True, use_lazy_mode=True - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model, gaudi_config, args, train_dataset=train_dataset, eval_dataset=eval_dataset) - - trainer.train(resume_from_checkpoint=False) - - @require_safetensors - def test_resume_training_with_safe_checkpoint(self): - # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of - # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model - # won't be the same since the training dataloader is shuffled). - - for initial_safe in [False, True]: - for loaded_safe in [False, True]: - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, - train_len=128, - save_steps=5, - learning_rate=0.1, - save_safetensors=initial_safe, - ) - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(tmpdir, "checkpoint-5") - self.convert_to_sharded_checkpoint(checkpoint, load_safe=initial_safe, save_safe=loaded_safe) - - # Reinitialize trainer - trainer = get_regression_trainer( - output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, save_safetensors=loaded_safe - ) - - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - def test_resume_training_with_gradient_accumulation(self): - # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of - # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model - # won't be the same since the training dataloader is shuffled). 
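# A minimal sketch of the arithmetic behind the caveat above, assuming this
# test's settings (train_len=128, per_device_train_batch_size=4,
# gradient_accumulation_steps=2, save_steps=5); the helper name is illustrative.
def _optimizer_steps_per_epoch(n_devices, train_len=128, per_device_bs=4, grad_accum=2):
    # One optimizer step consumes per_device_bs * n_devices * grad_accum samples.
    return train_len // (per_device_bs * n_devices * grad_accum)
# _optimizer_steps_per_epoch(1) == 16 and _optimizer_steps_per_epoch(2) == 8, so
# checkpoint-5 still falls inside the first epoch; with 4 devices there are only
# 4 steps per epoch, checkpoint-5 is only reached in epoch 2, and the shuffled
# dataloader no longer replays the same samples on resume.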
- - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, - train_len=128, - gradient_accumulation_steps=2, - per_device_train_batch_size=4, - save_steps=5, - learning_rate=0.1, - ) - # Disable FusedClipNorm because it makes this test fail - # TODO: investigate why - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(tmpdir, "checkpoint-5") - - # Reinitialize trainer - trainer = get_regression_trainer( - output_dir=tmpdir, - train_len=128, - gradient_accumulation_steps=2, - per_device_train_batch_size=4, - save_steps=5, - learning_rate=0.1, - ) - # Disable FusedClipNorm because it makes this test fail - # TODO: investigate why - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train(resume_from_checkpoint=checkpoint) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - def test_resume_training_with_frozen_params(self): - # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of - # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model - # won't be the same since the training dataloader is shuffled). - - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, - train_len=128, - per_device_train_batch_size=4, - save_steps=5, - learning_rate=0.1, - ) - trainer.model.a.requires_grad_(False) - # Disable FusedClipNorm because it makes this test fail - # TODO: investigate why - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train() - (a, b) = trainer.model.a.item(), trainer.model.b.item() - state = dataclasses.asdict(trainer.state) - - checkpoint = os.path.join(tmpdir, "checkpoint-5") - - # Reinitialize trainer - trainer = get_regression_trainer( - output_dir=tmpdir, - train_len=128, - per_device_train_batch_size=4, - save_steps=5, - learning_rate=0.1, - ) - trainer.model.a.requires_grad_(False) - # Disable FusedClipNorm because it makes this test fail - # TODO: investigate why - trainer.gaudi_config.use_fused_clip_norm = False - trainer.train(resume_from_checkpoint=checkpoint) - self.assertFalse(trainer.model.a.requires_grad) - (a1, b1) = trainer.model.a.item(), trainer.model.b.item() - state1 = dataclasses.asdict(trainer.state) - - self.assertEqual(a, a1) - self.assertEqual(b, b1) - self.check_trainer_state_are_the_same(state, state1) - - def test_load_best_model_at_end(self): - total = int(self.n_epochs * 64 / self.batch_size) - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - a=1.5, - b=2.5, - output_dir=tmpdir, - learning_rate=0.1, - eval_steps=5, - evaluation_strategy="steps", - save_steps=5, - load_best_model_at_end=True, - ) - self.assertFalse(trainer.args.greater_is_better) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, total) - self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss") - - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - a=1.5, - b=2.5, - output_dir=tmpdir, - learning_rate=0.1, - eval_steps=5, - evaluation_strategy="steps", - save_steps=5, - load_best_model_at_end=True, - metric_for_best_model="accuracy", - compute_metrics=AlmostAccuracy(), - ) - 
self.assertTrue(trainer.args.greater_is_better) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, total) - self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_accuracy", greater_is_better=True) - - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - a=1.5, - b=2.5, - output_dir=tmpdir, - learning_rate=0.1, - evaluation_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - metric_for_best_model="accuracy", - compute_metrics=AlmostAccuracy(), - ) - self.assertTrue(trainer.args.greater_is_better) - trainer.train() - self.check_saved_checkpoints(tmpdir, 64 // self.batch_size, total) - self.check_best_model_has_been_loaded( - tmpdir, 64 // self.batch_size, total, trainer, "eval_accuracy", greater_is_better=True - ) - - # Test this works with a non PreTrainedModel - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, - learning_rate=0.1, - eval_steps=5, - evaluation_strategy="steps", - save_steps=5, - load_best_model_at_end=True, - pretrained=False, - ) - self.assertFalse(trainer.args.greater_is_better) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False) - self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False) - - @require_safetensors - def test_load_best_model_from_safetensors(self): - total = int(self.n_epochs * 64 / self.batch_size) - for save_safetensors, pretrained in product([False, True], [False, True]): - with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - a=1.5, - b=2.5, - output_dir=tmpdir, - learning_rate=0.1, - eval_steps=5, - evaluation_strategy="steps", - save_steps=5, - load_best_model_at_end=True, - save_safetensors=save_safetensors, - pretrained=pretrained, - ) - self.assertFalse(trainer.args.greater_is_better) - trainer.train() - self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=pretrained, safe_weights=save_safetensors) - self.check_best_model_has_been_loaded( - tmpdir, 5, total, trainer, "eval_loss", is_pretrained=pretrained, safe_weights=save_safetensors - ) - - def test_training_iterable_dataset(self): - config = RegressionModelConfig() - model = RegressionPreTrainedModel(config) - # Adding one column not used by the model should have no impact - train_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - - args = RegressionGaudiTrainingArguments( - output_dir="./examples", max_steps=4, use_habana=True, use_lazy_mode=True - ) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, train_dataset=train_dataset) - trainer.train() - self.assertEqual(trainer.state.global_step, 4) - - loader = trainer.get_train_dataloader() - self.assertIsInstance(loader, torch.utils.data.DataLoader) - self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) - - def test_evaluation_iterable_dataset(self): - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - # Adding one column not used by the model should have no impact - eval_dataset = SampleIterableDataset(label_names=["labels", "extra"]) - - args = RegressionGaudiTrainingArguments(output_dir="./examples", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=args, - eval_dataset=eval_dataset, - compute_metrics=AlmostAccuracy(), - ) - results = 
trainer.evaluate() - - x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - # With a number of elements not a round multiple of the batch size - eval_dataset = SampleIterableDataset(length=66) - results = trainer.evaluate(eval_dataset) - - x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] - pred = 1.5 * x + 2.5 - expected_loss = ((pred - y) ** 2).mean() - self.assertAlmostEqual(results["eval_loss"], expected_loss) - expected_acc = AlmostAccuracy()((pred, y))["accuracy"] - self.assertAlmostEqual(results["eval_accuracy"], expected_acc) - - def test_predict_iterable_dataset(self): - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - args = RegressionGaudiTrainingArguments(output_dir="./examples", use_habana=True, use_lazy_mode=True) - gaudi_config = get_gaudi_config() - trainer = GaudiTrainer( - model=model, - gaudi_config=gaudi_config, - args=args, - eval_dataset=eval_dataset, - compute_metrics=AlmostAccuracy(), - ) - - preds = trainer.predict(trainer.eval_dataset).predictions - x = eval_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - # With a number of elements not a round multiple of the batch size - # Adding one column not used by the model should have no impact - test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"]) - preds = trainer.predict(test_dataset).predictions - x = test_dataset.dataset.x - self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) - - def test_num_train_epochs_in_training(self): - # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given. - # It should give 1 update step for each epoch. - trainer = get_regression_trainer( - max_steps=3, train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5 - ) - train_output = trainer.train() - self.assertEqual(train_output.global_step, 3) - - # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if - # len(train_dl) < gradient_accumulation_steps. 
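# Concretely, with a single device and this test's settings (train_len=64,
# per_device_train_batch_size=16), the dataloader yields 4 batches per epoch,
# fewer than gradient_accumulation_steps=5, so each epoch still produces exactly
# one optimizer update and global_step ends up equal to num_train_epochs, as
# asserted below.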
- trainer = get_regression_trainer(train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5) - train_output = trainer.train() - self.assertEqual(train_output.global_step, int(self.n_epochs)) - - def test_early_stopping_callback(self): - # early stopping stops training before num_training_epochs - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=tmp_dir, - num_train_epochs=20, - gradient_accumulation_steps=1, - per_device_train_batch_size=16, - load_best_model_at_end=True, - evaluation_strategy=IntervalStrategy.EPOCH, - save_strategy=IntervalStrategy.EPOCH, - compute_metrics=AlmostAccuracy(), - metric_for_best_model="accuracy", - ) - trainer.add_callback(EarlyStoppingCallback(1, 0.0001)) - train_output = trainer.train() - self.assertLess(train_output.global_step, 20 * 64 / 16) - - # Invalid inputs to trainer with early stopping callback result in assertion error - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=tmp_dir, - num_train_epochs=20, - gradient_accumulation_steps=1, - per_device_train_batch_size=16, - evaluation_strategy=IntervalStrategy.EPOCH, - compute_metrics=AlmostAccuracy(), - metric_for_best_model="accuracy", - ) - trainer.add_callback(EarlyStoppingCallback(1)) - self.assertEqual(trainer.state.global_step, 0) - try: - trainer.train() - except AssertionError: - self.assertEqual(trainer.state.global_step, 0) - - def test_flos_extraction(self): - trainer = get_regression_trainer(learning_rate=0.1) - - def assert_flos_extraction(trainer, wrapped_model_to_check): - self.assertEqual(trainer.model, unwrap_model(wrapped_model_to_check)) - self.assertGreaterEqual(getattr(unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0) - - # with plain model - assert_flos_extraction(trainer, trainer.model) - - # # with enforced DataParallel - # assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) - - trainer.train() - self.assertTrue(isinstance(trainer.state.total_flos, float)) - - def check_checkpoint_deletion(self, trainer, output_dir, expected): - # Make fake checkpoints - for n in [5, 10, 15, 20, 25]: - os.makedirs(os.path.join(output_dir, f"{PREFIX_CHECKPOINT_DIR}-{n}"), exist_ok=True) - trainer._rotate_checkpoints(output_dir=output_dir) - glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{PREFIX_CHECKPOINT_DIR}-*")] - values = [int(re.match(f".*{PREFIX_CHECKPOINT_DIR}-([0-9]+)", d).groups()[0]) for d in glob_checkpoints] - self.assertSetEqual(set(values), set(expected)) - - def test_checkpoint_rotation(self): - with tempfile.TemporaryDirectory() as tmp_dir: - # Without best model at end - trainer = get_regression_trainer(output_dir=tmp_dir, save_total_limit=2) - self.check_checkpoint_deletion(trainer, tmp_dir, [20, 25]) - - # With best model at end - trainer = get_regression_trainer( - output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=2 - ) - trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5") - self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25]) - - # Edge case: we don't always honor save_total_limit=1 if load_best_model_at_end=True to be able to resume - # from checkpoint - trainer = get_regression_trainer( - output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=1 - ) - trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25") - self.check_checkpoint_deletion(trainer, tmp_dir, [25]) - - 
trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5") - self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25]) - - def check_mem_metrics(self, trainer, check_func): - metrics = trainer.train().metrics - check_func("init_mem_cpu_alloc_delta", metrics) - check_func("train_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: - check_func("init_mem_gpu_alloc_delta", metrics) - check_func("train_mem_gpu_alloc_delta", metrics) - - metrics = trainer.evaluate() - check_func("eval_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: - check_func("eval_mem_gpu_alloc_delta", metrics) - - metrics = trainer.predict(RegressionDataset()).metrics - check_func("test_mem_cpu_alloc_delta", metrics) - if torch.cuda.device_count() > 0: - check_func("test_mem_gpu_alloc_delta", metrics) - - def test_mem_metrics(self): - # with mem metrics enabled - trainer = get_regression_trainer(skip_memory_metrics=False) - self.check_mem_metrics(trainer, self.assertIn) - - # with mem metrics disabled - trainer = get_regression_trainer(skip_memory_metrics=True) - self.check_mem_metrics(trainer, self.assertNotIn) - - def test_no_wd_param_group(self): - model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) - gaudi_config = get_gaudi_config() - args = GaudiTrainingArguments(output_dir="./test", use_habana=True, use_lazy_mode=True) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args) - trainer.create_optimizer_and_scheduler(10) - wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight'] # fmt: skip - wd_params = [p for n, p in model.named_parameters() if n in wd_names] - no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names] - self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) - self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) - - def test_accelerator_config_empty(self): - # Checks that a config can be made with the defaults if not passed - with tempfile.TemporaryDirectory() as tmp_dir: - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - # Leaves one option as something *not* basic - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments(output_dir=tmp_dir, use_habana=True) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, False) - self.assertEqual(trainer.accelerator.dispatch_batches, None) - self.assertEqual(trainer.accelerator.even_batches, True) - self.assertEqual(trainer.accelerator.use_seedable_sampler, True) - - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - # gradient accumulation kwargs configures gradient_state - self.assertNotIn("sync_each_batch", trainer.accelerator.gradient_state.plugin_kwargs) - - def test_accelerator_config_from_dict(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - with tempfile.TemporaryDirectory() as tmp_dir: - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - accelerator_config = { - "split_batches": True, - "dispatch_batches": True, - "even_batches": False, - "use_seedable_sampler": True, - } - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - 
accelerator_config["gradient_accumulation_kwargs"] = {"sync_each_batch": True} - - # Leaves all options as something *not* basic - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - accelerator_config=accelerator_config, - use_habana=True, - ) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - self.assertEqual(trainer.accelerator.dispatch_batches, True) - self.assertEqual(trainer.accelerator.even_batches, False) - self.assertEqual(trainer.accelerator.use_seedable_sampler, True) - - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) - - def test_accelerator_config_from_yaml(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - with tempfile.TemporaryDirectory() as tmp_dir: - path_file = Path(tmp_dir) / "accelerator_config.json" - with open(path_file, "w") as f: - accelerator_config = { - "split_batches": True, - "dispatch_batches": True, - "even_batches": False, - "use_seedable_sampler": False, - } - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - accelerator_config["gradient_accumulation_kwargs"] = {"sync_each_batch": True} - json.dump(accelerator_config, f) - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - # Leaves all options as something *not* basic - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments(output_dir=tmp_dir, accelerator_config=path_file, use_habana=True) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - self.assertEqual(trainer.accelerator.dispatch_batches, True) - self.assertEqual(trainer.accelerator.even_batches, False) - self.assertEqual(trainer.accelerator.use_seedable_sampler, False) - - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) - - def test_accelerator_config_from_dataclass(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - - accelerator_config = AcceleratorConfig( - split_batches=True, - dispatch_batches=True, - even_batches=False, - use_seedable_sampler=False, - ) - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - with tempfile.TemporaryDirectory() as tmp_dir: - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, accelerator_config=accelerator_config, use_habana=True - ) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - self.assertEqual(trainer.accelerator.dispatch_batches, True) - self.assertEqual(trainer.accelerator.even_batches, False) - self.assertEqual(trainer.accelerator.use_seedable_sampler, False) - - @require_accelerate_version_min_0_28 - def test_accelerate_config_from_dataclass_grad_accum(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - - grad_acc_kwargs = { - "num_steps": 10, - "adjust_scheduler": False, - "sync_with_dataloader": False, - "sync_each_batch": True, - } - 
accelerator_config = AcceleratorConfig( - split_batches=True, - dispatch_batches=True, - even_batches=False, - use_seedable_sampler=False, - gradient_accumulation_kwargs=grad_acc_kwargs, - ) - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - with tempfile.TemporaryDirectory() as tmp_dir: - args = RegressionGaudiTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config) - trainer = GaudiTrainer(model=model, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["num_steps"], 10) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["adjust_scheduler"], False) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_with_dataloader"], False) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) - - def test_accelerator_config_from_partial(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - with tempfile.TemporaryDirectory() as tmp_dir: - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - # Leaves one option as something *not* basic - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - accelerator_config={ - "split_batches": True, - }, - use_habana=True, - ) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - self.assertEqual(trainer.accelerator.dispatch_batches, None) - self.assertEqual(trainer.accelerator.even_batches, True) - self.assertEqual(trainer.accelerator.use_seedable_sampler, True) - - def test_accelerator_config_from_dict_with_deprecated_args(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - # and maintains the deprecated args if passed in - with tempfile.TemporaryDirectory() as tmp_dir: - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - # Leaves all options as something *not* basic - with self.assertWarns(FutureWarning) as cm: - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - accelerator_config={ - "split_batches": True, - }, - dispatch_batches=False, - use_habana=True, - ) - self.assertIn("dispatch_batches", str(cm.warnings[0].message)) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.dispatch_batches, False) - self.assertEqual(trainer.accelerator.split_batches, True) - with self.assertWarns(FutureWarning) as cm: - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - accelerator_config={ - "even_batches": False, - }, - split_batches=True, - use_habana=True, - ) - self.assertIn("split_batches", str(cm.warnings[0].message)) - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - self.assertEqual(trainer.accelerator.even_batches, False) - self.assertEqual(trainer.accelerator.dispatch_batches, None) - - def test_accelerator_config_only_deprecated_args(self): - with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertWarns(FutureWarning) as 
cm: - gaudi_config = get_gaudi_config() - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - split_batches=True, - use_habana=True, - ) - self.assertIn("split_batches", str(cm.warnings[0].message)) - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.split_batches, True) - - @require_accelerate_version_min_0_28 - def test_accelerator_config_from_dict_grad_accum_num_steps(self): - with tempfile.TemporaryDirectory() as tmp_dir: - config = RegressionModelConfig(a=1.5, b=2.5) - model = RegressionPreTrainedModel(config) - eval_dataset = SampleIterableDataset() - - # case - TrainingArguments.gradient_accumulation_steps == 1 - # - gradient_accumulation_kwargs['num_steps] == 1 - # results in grad accum set to 1 - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - gradient_accumulation_steps=1, - accelerator_config={ - "gradient_accumulation_kwargs": { - "num_steps": 1, - } - }, - ) - trainer = GaudiTrainer(model=model, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["num_steps"], 1) - - # case - TrainingArguments.gradient_accumulation_steps > 1 - # - gradient_accumulation_kwargs['num_steps] specified - # results in exception raised - args = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - gradient_accumulation_steps=2, - accelerator_config={ - "gradient_accumulation_kwargs": { - "num_steps": 10, - } - }, - ) - with self.assertRaises(Exception) as context: - trainer = GaudiTrainer(model=model, args=args, eval_dataset=eval_dataset) - self.assertTrue("The `AcceleratorConfig`'s `num_steps` is set but" in str(context.exception)) - - def test_accelerator_config_not_instantiated(self): - # Checks that accelerator kwargs can be passed through - # and the accelerator is initialized respectively - with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(NotImplementedError) as context: - _ = RegressionGaudiTrainingArguments( - output_dir=tmp_dir, - accelerator_config=AcceleratorConfig, - use_habana=True, - use_lazy_mode=True, - ) - self.assertTrue("Tried passing in a callable to `accelerator_config`" in str(context.exception)) - - # Now test with a custom subclass - @dataclasses.dataclass - class CustomAcceleratorConfig(AcceleratorConfig): - pass - - @dataclasses.dataclass - class CustomTrainingArguments(GaudiTrainingArguments): - accelerator_config: dict = dataclasses.field( - default=CustomAcceleratorConfig, - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(NotImplementedError) as context: - _ = CustomTrainingArguments( - output_dir=tmp_dir, - use_habana=True, - use_lazy_mode=True, - ) - self.assertTrue("Tried passing in a callable to `accelerator_config`" in str(context.exception)) - - def test_profiling(self): - # 24 total steps and compilation takes place during the 1st three steps - trainer = get_regression_trainer(profiling_warmup_steps=3, profiling_steps=21) - trainer.train() - - -@require_torch -@is_staging_test -class GaudiTrainerIntegrationWithHubTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._token = TOKEN - HfFolder.save_token(TOKEN) - - @classmethod - def tearDownClass(cls): - for model in [ - "test-trainer", - "test-trainer-epoch", - "test-trainer-step", - "test-trainer-tensorboard", - "test-trainer-tags", - ]: - 
try: - delete_repo(token=cls._token, repo_id=model) - except HTTPError: - pass - - try: - delete_repo(token=cls._token, repo_id="valid_org/test-trainer-org") - except HTTPError: - pass - - def test_push_to_hub(self): - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer"), - push_to_hub=True, - hub_token=self._token, - ) - url = trainer.push_to_hub() - - # Extract repo_name from the url - re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) - self.assertTrue(re_search is not None) - repo_name = re_search.groups()[0] - - self.assertEqual(repo_name, f"{USER}/test-trainer") - - model = RegressionPreTrainedModel.from_pretrained(repo_name) - self.assertEqual(model.a.item(), trainer.model.a.item()) - self.assertEqual(model.b.item(), trainer.model.b.item()) - - def test_push_to_hub_in_organization(self): - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer(output_dir=tmp_dir) - trainer.save_model() - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer-org"), - push_to_hub=True, - hub_model_id="valid_org/test-trainer-org", - hub_token=self._token, - ) - url = trainer.push_to_hub() - - # Extract repo_name from the url - re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) - self.assertTrue(re_search is not None) - repo_name = re_search.groups()[0] - self.assertEqual(repo_name, "valid_org/test-trainer-org") - - model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org") - self.assertEqual(model.a.item(), trainer.model.a.item()) - self.assertEqual(model.b.item(), trainer.model.b.item()) - - def get_commit_history(self, repo): - commit_logs = subprocess.run( - "git log".split(), - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - check=True, - encoding="utf-8", - cwd=repo, - ).stdout - commits = commit_logs.split("\n\n")[1::2] - return [commit.strip() for commit in commits] - - def test_push_to_hub_with_saves_each_epoch(self): - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer-epoch"), - push_to_hub=True, - hub_token=self._token, - # To avoid any flakiness if the training goes faster than the uploads. - hub_always_push=True, - save_strategy="epoch", - ) - trainer.train() - - commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token) - commits = [c.title for c in commits] - self.assertIn("initial commit", commits) - for i in range(1, 4): - self.assertIn(f"Training in progress, epoch {i}", commits) - - def test_push_to_hub_with_saves_each_n_steps(self): - num_gpus = max(1, get_gpu_count()) - if num_gpus > 2: - return - - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer-step"), - push_to_hub=True, - hub_token=self._token, - # To avoid any flakiness if the training goes faster than the uploads. 
- hub_always_push=True, - save_strategy="steps", - save_steps=5, - ) - trainer.train() - - commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token) - commits = [c.title for c in commits] - self.assertIn("initial commit", commits) - - # max_steps depend on the number of available GPUs - max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader())) - for i in range(5, max_steps, 5): - self.assertIn(f"Training in progress, step {i}", commits) - - @require_tensorboard - def test_push_to_hub_with_tensorboard_logs(self): - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"), - hub_token=self._token, - save_strategy="epoch", - report_to=["tensorboard"], - keep_report_to=True, - ) - trainer.train() - # Push the runs via `push_to_hub()` - trainer.push_to_hub() - - files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token) - found_log = False - for f in files: - if len(f.split("runs")) > 1 and "events.out.tfevents" in f: - found_log = True - - assert found_log is True, "No tensorboard log found in repo" - - def test_push_to_hub_tags(self): - # Checks if `trainer.push_to_hub()` works correctly by adding the desired - # tag without having to pass `tags` in `push_to_hub` - # see: - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=os.path.join(tmp_dir, "test-trainer-tags"), - push_to_hub=True, - hub_token=self._token, - ) - - trainer.model.add_model_tags(["test-trainer-tags"]) - - url = trainer.push_to_hub() - - # Extract repo_name from the url - re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) - self.assertTrue(re_search is not None) - repo_name = re_search.groups()[0] - - self.assertEqual(repo_name, f"{USER}/test-trainer-tags") - - model_card = ModelCard.load(repo_name) - self.assertTrue("test-trainer-tags" in model_card.data.tags) - - -@require_torch -@require_optuna -class GaudiTrainerHyperParameterOptunaIntegrationTest(unittest.TestCase): - def setUp(self): - args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - - def test_hyperparameter_search(self): - class MyTrialShortNamer(TrialShortNamer): - DEFAULTS = {"a": 0, "b": 0} - - def hp_space(trial): - return {} - - def model_init(trial): - if trial is not None: - a = trial.suggest_int("a", -4, 4) - b = trial.suggest_int("b", -4, 4) - else: - a = 0 - b = 0 - config = RegressionModelConfig(a=a, b=b, double_output=False) - - return RegressionPreTrainedModel(config) - - def hp_name(trial): - return MyTrialShortNamer.shortname(trial.params) - - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=tmp_dir, - learning_rate=0.1, - logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, - save_strategy=IntervalStrategy.EPOCH, - num_train_epochs=4, - disable_tqdm=True, - load_best_model_at_end=True, - logging_dir="runs", - run_name="test", - model_init=model_init, - ) - trainer.hyperparameter_search(direction="minimize", hp_space=hp_space, hp_name=hp_name, n_trials=4) - - -@require_torch -@require_optuna -class TrainerHyperParameterMultiObjectOptunaIntegrationTest(unittest.TestCase): - def setUp(self): - args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) - self.n_epochs = args.num_train_epochs - self.batch_size = args.train_batch_size - - def 
test_hyperparameter_search(self): - class MyTrialShortNamer(TrialShortNamer): - DEFAULTS = {"a": 0, "b": 0} - - def hp_space(trial): - return {} - - def model_init(trial): - if trial is not None: - a = trial.suggest_int("a", -4, 4) - b = trial.suggest_int("b", -4, 4) - else: - a = 0 - b = 0 - config = RegressionModelConfig(a=a, b=b, double_output=False) - - return RegressionPreTrainedModel(config) - - def hp_name(trial): - return MyTrialShortNamer.shortname(trial.params) - - def compute_objective(metrics: Dict[str, float]) -> List[float]: - return metrics["eval_loss"], metrics["eval_accuracy"] - - with tempfile.TemporaryDirectory() as tmp_dir: - trainer = get_regression_trainer( - output_dir=tmp_dir, - learning_rate=0.1, - logging_steps=1, - evaluation_strategy=IntervalStrategy.EPOCH, - save_strategy=IntervalStrategy.EPOCH, - num_train_epochs=10, - disable_tqdm=True, - load_best_model_at_end=True, - logging_dir="runs", - run_name="test", - model_init=model_init, - compute_metrics=AlmostAccuracy(), - ) - trainer.hyperparameter_search( - direction=["minimize", "maximize"], - hp_space=hp_space, - hp_name=hp_name, - n_trials=4, - compute_objective=compute_objective, - ) - - -# TODO: crashes because `TypeError: cannot pickle 'PyCapsule' object` -# @require_torch -# @require_ray -# class GaudiTrainerHyperParameterRayIntegrationTest(unittest.TestCase): -# def setUp(self): -# args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) -# self.n_epochs = args.num_train_epochs -# self.batch_size = args.train_batch_size - -# def ray_hyperparameter_search(self): -# class MyTrialShortNamer(TrialShortNamer): -# DEFAULTS = {"a": 0, "b": 0} - -# def hp_space(trial): -# from ray import tune - -# return { -# "a": tune.randint(-4, 4), -# "b": tune.randint(-4, 4), -# } - -# def model_init(config): -# if config is None: -# a = 0 -# b = 0 -# else: -# a = config["a"] -# b = config["b"] -# model_config = RegressionModelConfig(a=a, b=b, double_output=False) - -# return RegressionPreTrainedModel(model_config) - -# def hp_name(params): -# return MyTrialShortNamer.shortname(params) - -# with tempfile.TemporaryDirectory() as tmp_dir: -# trainer = get_regression_trainer( -# output_dir=tmp_dir, -# learning_rate=0.1, -# logging_steps=1, -# evaluation_strategy=IntervalStrategy.EPOCH, -# save_strategy=IntervalStrategy.EPOCH, -# num_train_epochs=4, -# disable_tqdm=True, -# load_best_model_at_end=True, -# logging_dir="runs", -# run_name="test", -# model_init=model_init, -# ) -# trainer.hyperparameter_search( -# direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="ray", n_trials=4 -# ) - -# def test_hyperparameter_search(self): -# self.ray_hyperparameter_search() - -# def test_hyperparameter_search_ray_client(self): -# import ray -# from ray.util.client.ray_client_helpers import ray_start_client_server - -# with ray_start_client_server(): -# assert ray.util.client.ray.is_connected() -# self.ray_hyperparameter_search() - - -# TODO: enable this test when a SIGOPT_API_TOKEN is added to Github Actions secrets -# @require_torch -# @require_sigopt -# class GaudiTrainerHyperParameterSigOptIntegrationTest(unittest.TestCase): -# def setUp(self): -# args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) -# self.n_epochs = args.num_train_epochs -# self.batch_size = args.train_batch_size - -# def test_hyperparameter_search(self): -# class MyTrialShortNamer(TrialShortNamer): -# DEFAULTS = {"a": 0, "b": 0} - -# def hp_space(trial): -# return [ -# {"bounds": {"min": -4, "max": 4}, "name": "a", 
"type": "int"}, -# {"bounds": {"min": -4, "max": 4}, "name": "b", "type": "int"}, -# ] - -# def model_init(trial): -# if trial is not None: -# a = trial.assignments["a"] -# b = trial.assignments["b"] -# else: -# a = 0 -# b = 0 -# config = RegressionModelConfig(a=a, b=b, double_output=False) - -# return RegressionPreTrainedModel(config) - -# def hp_name(trial): -# return MyTrialShortNamer.shortname(trial.assignments) - -# with tempfile.TemporaryDirectory() as tmp_dir: -# trainer = get_regression_trainer( -# output_dir=tmp_dir, -# learning_rate=0.1, -# logging_steps=1, -# evaluation_strategy=IntervalStrategy.EPOCH, -# save_strategy=IntervalStrategy.EPOCH, -# num_train_epochs=4, -# disable_tqdm=True, -# load_best_model_at_end=True, -# logging_dir="runs", -# run_name="test", -# model_init=model_init, -# ) -# trainer.hyperparameter_search( -# direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="sigopt", n_trials=4 -# ) - - -optim_test_params = [] -if is_torch_available(): - default_adam_kwargs = { - "betas": (GaudiTrainingArguments.adam_beta1, GaudiTrainingArguments.adam_beta2), - "eps": GaudiTrainingArguments.adam_epsilon, - "lr": GaudiTrainingArguments.learning_rate, - } - - optim_test_params = [ - ( - OptimizerNames.ADAMW_HF, - transformers.optimization.AdamW, - default_adam_kwargs, - ), - ( - OptimizerNames.ADAMW_HF.value, - transformers.optimization.AdamW, - default_adam_kwargs, - ), - ( - OptimizerNames.ADAMW_TORCH, - torch.optim.AdamW, - default_adam_kwargs, - ), - ( - OptimizerNames.ADAFACTOR, - transformers.optimization.Adafactor, - { - "scale_parameter": False, - "relative_step": False, - "lr": GaudiTrainingArguments.learning_rate, - }, - ), - ] - - -@require_torch -class GaudiTrainerOptimizerChoiceTest(unittest.TestCase): - def check_optim_and_kwargs(self, optim: OptimizerNames, mandatory_kwargs, expected_cls): - args = GaudiTrainingArguments(optim=optim, output_dir="None", use_habana=True, use_lazy_mode=True) - actual_cls, optim_kwargs = GaudiTrainer.get_optimizer_cls_and_kwargs(args) - self.assertEqual(expected_cls, actual_cls) - self.assertIsNotNone(optim_kwargs) - - for p, v in mandatory_kwargs.items(): - self.assertTrue(p in optim_kwargs) - actual_v = optim_kwargs[p] - self.assertTrue(actual_v == v, f"Failed check for {p}. 
Expected {v}, but got {actual_v}.") - - @parameterized.expand(optim_test_params, skip_on_empty=True) - def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs): - # exercises all the valid --optim options - self.check_optim_and_kwargs(name, mandatory_kwargs, expected_cls) - - trainer = get_regression_trainer(optim=name) - trainer.gaudi_config.use_fused_adam = False - trainer.train() - - -# TODO: solve the Git error returned by this test -# @require_torch -# @require_wandb -# class GaudiTrainerHyperParameterWandbIntegrationTest(unittest.TestCase): -# def setUp(self): -# args = GaudiTrainingArguments("..", use_habana=True, use_lazy_mode=True) -# self.n_epochs = args.num_train_epochs -# self.batch_size = args.train_batch_size - -# def test_hyperparameter_search(self): -# class MyTrialShortNamer(TrialShortNamer): -# DEFAULTS = {"a": 0, "b": 0} - -# def hp_space(trial): -# return { -# "method": "random", -# "metric": {}, -# "parameters": { -# "a": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, -# "b": {"distribution": "int_uniform", "min": 1, "max": 6}, -# }, -# } - -# def model_init(config): -# if config is None: -# a = 0 -# b = 0 -# else: -# a = config["a"] -# b = config["b"] -# model_config = RegressionModelConfig(a=a, b=b, double_output=False) - -# return RegressionPreTrainedModel(model_config) - -# def hp_name(params): -# return MyTrialShortNamer.shortname(params) - -# with tempfile.TemporaryDirectory() as tmp_dir: -# trainer = get_regression_trainer( -# output_dir=tmp_dir, -# learning_rate=0.1, -# logging_steps=1, -# evaluation_strategy=IntervalStrategy.EPOCH, -# save_strategy=IntervalStrategy.EPOCH, -# num_train_epochs=4, -# disable_tqdm=True, -# load_best_model_at_end=True, -# logging_dir="runs", -# run_name="test", -# model_init=model_init, -# ) -# trainer.hyperparameter_search( -# direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="wandb", n_trials=4, anonymous="must" -# ) - - -class HyperParameterSearchBackendsTest(unittest.TestCase): - def test_hyperparameter_search_backends(self): - self.assertEqual( - list(ALL_HYPERPARAMETER_SEARCH_BACKENDS.keys()), - list(HPSearchBackend), - ) - - -@require_torch -class OptimizerAndModelInspectionTest(unittest.TestCase): - def test_get_num_trainable_parameters(self): - model = nn.Sequential(nn.Linear(128, 64), nn.Linear(64, 32)) - args = GaudiTrainingArguments( - output_dir="tmp_trainer", - use_habana=True, - use_lazy_mode=True, - ) - # in_features * out_features + bias - layer_1 = 128 * 64 + 64 - layer_2 = 64 * 32 + 32 - trainer = GaudiTrainer(model=model, gaudi_config=get_gaudi_config(), args=args) - self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2) - # Freeze the last layer - for param in model[-1].parameters(): - param.requires_grad = False - self.assertEqual(trainer.get_num_trainable_parameters(), layer_1) - - def test_get_learning_rates(self): - model = nn.Sequential(nn.Linear(128, 64)) - args = GaudiTrainingArguments( - output_dir="tmp_trainer", - use_habana=True, - use_lazy_mode=True, - ) - trainer = GaudiTrainer(model=model, gaudi_config=get_gaudi_config(), args=args) - with self.assertRaises(ValueError): - trainer.get_learning_rates() - trainer.create_optimizer() - self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05]) - - def test_get_optimizer_group(self): - model = nn.Sequential(nn.Linear(128, 64)) - args = GaudiTrainingArguments( - output_dir="tmp_trainer", - use_habana=True, - use_lazy_mode=True, - ) - trainer = GaudiTrainer(model=model, 
gaudi_config=get_gaudi_config(), args=args) - # ValueError is raised if optimizer is None - with self.assertRaises(ValueError): - trainer.get_optimizer_group() - trainer.create_optimizer() - # Get groups - num_groups = len(trainer.get_optimizer_group()) - self.assertEqual(num_groups, 2) - # Get group of parameter - param = next(model.parameters()) - group = trainer.get_optimizer_group(param) - self.assertIn(param, group["params"]) diff --git a/tests/test_trainer_distributed.py b/tests/test_trainer_distributed.py deleted file mode 100644 index 84413f9022..0000000000 --- a/tests/test_trainer_distributed.py +++ /dev/null @@ -1,182 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -from typing import Dict - -from transformers import EvalPrediction, HfArgumentParser, is_torch_available -from transformers.testing_utils import TestCasePlus - -from optimum.habana import GaudiConfig, GaudiTrainingArguments -from optimum.habana.distributed import DistributedRunner -from optimum.utils import logging - - -logger = logging.get_logger(__name__) - - -if is_torch_available(): - import torch - from torch import nn - from torch.utils.data import Dataset - - from optimum.habana import GaudiTrainer - - class DummyDataset(Dataset): - def __init__(self, length: int = 101): - self.length = length - - def __len__(self): - return self.length - - def __getitem__(self, i) -> int: - return i - - class DummyDataCollator: - def __call__(self, features): - return {"input_ids": torch.tensor(features), "labels": torch.tensor(features)} - - class DummyModel(nn.Module): - def __init__(self): - super().__init__() - # Add some (unused) params otherwise DDP will complain. 
- self.fc = nn.Linear(120, 80) - - def forward(self, input_ids, labels=None): - if labels is not None: - return torch.tensor(0.0, device=input_ids.device), input_ids - else: - return input_ids - - class RegressionModel(nn.Module): - def __init__(self, a=0, b=0, double_output=False): - super().__init__() - self.a = torch.nn.Parameter(torch.tensor(a).float()) - self.b = torch.nn.Parameter(torch.tensor(b).float()) - self.double_output = double_output - self.config = None - - def forward(self, input_x, labels=None, **kwargs): - y = input_x * self.a + self.b - if labels is None: - return (y, y) if self.double_output else (y,) - loss = torch.nn.functional.mse_loss(y, labels) - return (loss, y, y) if self.double_output else (loss, y) - - -class TestGaudiTrainerDistributed(TestCasePlus): - def _test_gaudi_trainer_distributed(self, kwargs={}): - output_dir = self.get_auto_remove_tmp_dir() - - command_list = [f"{self.test_file_dir}/test_trainer_distributed.py"] - command_list += ["--output_dir"] - command_list += [output_dir] - command_list += ["--use_habana"] - command_list += ["--use_lazy_mode"] - for key, value in kwargs.items(): - command_list += [f"--{key} {value}"] - command = [" ".join(command_list)] - - distributed_runner = DistributedRunner( - command_list=command, - world_size=8, - use_mpi=True, - ) - - ret_code = distributed_runner.run() - - # ret_code equals 0 or None if successful run - self.assertTrue(ret_code == 0 or ret_code is None) - - def test_gaudi_trainer_distributed(self): - self._test_gaudi_trainer_distributed() - - def test_gaudi_trainer_distributed_hpu_graphs(self): - self._test_gaudi_trainer_distributed( - { - "use_hpu_graphs_for_training": "", - "use_hpu_graphs_for_inference": "", - "distribution_strategy": "fast_ddp", - } - ) - - -if __name__ == "__main__": - # The script below is meant to be run under mpirun, on a machine with multiple HPUs: - # - # PYTHONPATH="src" python optimum-habana/examples/gaudi_spawn.py --world_size 8 --use_mpi --output_dir output_dir ./tests/test_trainer_distributed.py - - parser = HfArgumentParser((GaudiTrainingArguments,)) - training_args = parser.parse_args_into_dataclasses()[0] - - gaudi_config_file = Path(__file__).parent.resolve() / Path("configs/gaudi_config_trainer_test.json") - gaudi_config = GaudiConfig.from_pretrained(gaudi_config_file) - - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_hpu: {training_args.world_size}," - f" distributed training: {training_args.local_rank != -1}" - ) - - # Essentially, what we want to verify in the distributed case is that we get all samples back, - # in the right order. 
(this is crucial for prediction for instance) - for dataset_length in [101, 40, 7]: - dataset = DummyDataset(dataset_length) - - def compute_metrics(p: EvalPrediction) -> Dict: - sequential = list(range(len(dataset))) - success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential - if not success and training_args.local_rank == 0: - logger.warning( - "Predictions and/or labels do not match expected results:\n - predictions: " - f"{p.predictions.tolist()}\n - labels: {p.label_ids.tolist()}\n - expected: {sequential}" - ) - return {"success": success} - - trainer = GaudiTrainer( - model=DummyModel(), - gaudi_config=gaudi_config, - args=training_args, - data_collator=DummyDataCollator(), - eval_dataset=dataset, - compute_metrics=compute_metrics, - ) - - metrics = trainer.evaluate() - logger.info(metrics) - if metrics["eval_success"] is not True: - logger.error(metrics) - exit(1) - - p = trainer.predict(dataset) - logger.info(p.metrics) - if p.metrics["test_success"] is not True: - logger.error(p.metrics) - exit(1) - - trainer.args.eval_accumulation_steps = 2 - - metrics = trainer.evaluate() - logger.info(metrics) - if metrics["eval_success"] is not True: - logger.error(metrics) - exit(1) - - p = trainer.predict(dataset) - logger.info(p.metrics) - if p.metrics["test_success"] is not True: - logger.error(p.metrics) - exit(1) - - trainer.args.eval_accumulation_steps = None diff --git a/tests/test_trainer_seq2seq.py b/tests/test_trainer_seq2seq.py deleted file mode 100644 index 165ae0dcee..0000000000 --- a/tests/test_trainer_seq2seq.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration -from transformers.testing_utils import TestCasePlus, require_torch -from transformers.utils import is_datasets_available - -from optimum.habana import GaudiConfig, GaudiSeq2SeqTrainer, GaudiSeq2SeqTrainingArguments -from optimum.habana.transformers.generation import GaudiGenerationConfig -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -if is_datasets_available(): - import datasets - - -adapt_transformers_to_gaudi() - - -class GaudiSeq2seqTrainerTester(TestCasePlus): - @require_torch - def test_finetune_t5(self): - train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]") - val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]") - - train_dataset = train_dataset.select(range(32)) - val_dataset = val_dataset.select(range(16)) - - batch_size = 4 - - training_args = GaudiSeq2SeqTrainingArguments( - output_dir=self.get_auto_remove_tmp_dir(), - gaudi_config_name="Habana/t5", - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, - predict_with_generate=True, - do_train=True, - do_eval=True, - use_habana=True, - use_lazy_mode=True, - use_hpu_graphs_for_inference=True, - ) - - model = T5ForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-t5-v1.1") - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") - - model.config.max_length = 128 - - def _map_to_encoder_decoder_inputs(batch): - # Tokenizer will automatically set [BOS] [EOS] - inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512) - outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128) - batch["input_ids"] = inputs.input_ids - batch["attention_mask"] = inputs.attention_mask - - batch["decoder_input_ids"] = outputs.input_ids - batch["labels"] = outputs.input_ids.copy() - batch["decoder_attention_mask"] = outputs.attention_mask - - assert all(len(x) == 512 for x in inputs.input_ids) - assert all(len(x) == 128 for x in outputs.input_ids) - - return batch - - def _compute_metrics(pred): - labels_ids = pred.label_ids - pred_ids = pred.predictions - - # all unnecessary tokens are removed - pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) - label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True) - - accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str) - - return {"accuracy": accuracy} - - # map train dataset - train_dataset = train_dataset.map( - _map_to_encoder_decoder_inputs, - batched=True, - batch_size=batch_size, - remove_columns=["article", "highlights"], - ) - train_dataset.set_format( - type="torch", - columns=["input_ids", "attention_mask", "decoder_input_ids", "labels"], - ) - - # same for validation dataset - val_dataset = val_dataset.map( - _map_to_encoder_decoder_inputs, - batched=True, - batch_size=batch_size, - remove_columns=["article", "highlights"], - ) - val_dataset.set_format( - type="torch", - columns=["input_ids", "attention_mask", "decoder_input_ids", "labels"], - ) - - # instantiate trainer - trainer = GaudiSeq2SeqTrainer( - model=model, - args=training_args, - compute_metrics=_compute_metrics, - train_dataset=train_dataset, - eval_dataset=val_dataset, - tokenizer=tokenizer, - ) - - # start training - trainer.train() - - # start evaluation using greedy search - 
trainer.evaluate(max_length=model.config.max_length, num_beams=1) - - # start evaluation using beam search - trainer.evaluate(max_length=model.config.max_length, num_beams=2) - - @require_torch - def test_bad_generation_config_fail_early(self): - # Tests that a bad generation config causes the trainer to fail early - model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") - data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest") - gen_config = GaudiGenerationConfig( - do_sample=False, top_p=0.9 - ) # bad: top_p is not compatible with do_sample=False - - training_args = GaudiSeq2SeqTrainingArguments( - output_dir="tmp_trainer", - predict_with_generate=True, - generation_config=gen_config, - use_habana=True, - use_lazy_mode=True, - ) - with self.assertRaises(ValueError) as exc: - _ = GaudiSeq2SeqTrainer( - model=model, - gaudi_config=GaudiConfig(), - args=training_args, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=lambda x: {"samples": x[0].shape[0]}, - ) - self.assertIn("The loaded generation config instance is invalid", str(exc.exception)) diff --git a/tests/test_trl.py b/tests/test_trl.py deleted file mode 100644 index ebb64edf73..0000000000 --- a/tests/test_trl.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2023 metric-space, The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import torch -from transformers.testing_utils import slow -from trl import DDPOConfig - -from optimum.habana import GaudiConfig -from optimum.habana.trl import GaudiDDPOTrainer, GaudiDefaultDDPOStableDiffusionPipeline - - -def scorer_function(images, prompts, metadata): - return torch.randn(1) * 3.0, {} - - -def prompt_function(): - return ("cabbages", {}) - - -class GaudiDDPOTrainerTester(unittest.TestCase): - """ - Test the GaudiDDPOTrainer class. 
- - Adapted from https://github.com/huggingface/trl/blob/main/tests/test_ddpo_trainer.py - The main changes are: - - use GaudiDefaultDDPOStableDiffusionPipeline instead of DefaultDDPOStableDiffusionPipeline - - use GaudiDDPOTrainer instead of DDPOTrainer - - use bf16 instead of fp32 - - combine test_generate_samples and test_calculate_loss in single test - """ - - def setUp(self): - self.ddpo_config = DDPOConfig( - num_epochs=2, - train_gradient_accumulation_steps=1, - per_prompt_stat_tracking_buffer_size=32, - sample_num_batches_per_epoch=2, - sample_batch_size=2, - mixed_precision=None, - save_freq=1000000, - ) - pretrained_model = "hf-internal-testing/tiny-stable-diffusion-torch" - pretrained_revision = "main" - - gaudi_config = GaudiConfig() - pipeline = GaudiDefaultDDPOStableDiffusionPipeline( - pretrained_model, - pretrained_model_revision=pretrained_revision, - use_lora=True, - gaudi_config=gaudi_config, - use_habana=True, - use_hpu_graphs=False, - ) - - self.trainer = GaudiDDPOTrainer( - self.ddpo_config, - scorer_function, - prompt_function, - pipeline, - gaudi_config=gaudi_config, - use_habana=True, - use_hpu_graphs=False, - ) - - return super().setUp() - - def tearDown(self) -> None: - gc.collect() - - def test_loss(self): - advantage = torch.tensor([-1.0]) - clip_range = 0.0001 - ratio = torch.tensor([1.0]) - loss = self.trainer.loss(advantage, clip_range, ratio) - assert loss.item() == 1.0 - - @slow - def test_calculate_loss(self): - samples, output_pairs = self.trainer._generate_samples(1, 2) - assert len(samples) == 1 - assert len(output_pairs) == 1 - assert len(output_pairs[0][0]) == 2 - - sample = samples[0] - latents = sample["latents"][0, 0].unsqueeze(0) - next_latents = sample["next_latents"][0, 0].unsqueeze(0) - log_probs = sample["log_probs"][0, 0].unsqueeze(0) - timesteps = sample["timesteps"][0, 0].unsqueeze(0) - prompt_embeds = sample["prompt_embeds"] - advantage = torch.tensor([1.0], device=prompt_embeds.device) - - assert latents.shape == (1, 4, 64, 64) - assert next_latents.shape == (1, 4, 64, 64) - assert log_probs.shape == (1,) - assert timesteps.shape == (1,) - assert prompt_embeds.shape == (2, 77, 32) - loss, approx_kl, clipfrac = self.trainer.calculate_loss( - latents, timesteps, next_latents, log_probs, advantage, prompt_embeds - ) - - assert torch.isfinite(loss.cpu()) - - -class GaudiDDPOTrainerWithLoRATester(GaudiDDPOTrainerTester): - """ - Test the GaudiDDPOTrainer class. 
- """ - - def setUp(self): - self.ddpo_config = DDPOConfig( - num_epochs=2, - train_gradient_accumulation_steps=1, - per_prompt_stat_tracking_buffer_size=32, - sample_num_batches_per_epoch=2, - sample_batch_size=2, - mixed_precision=None, - save_freq=1000000, - ) - pretrained_model = "hf-internal-testing/tiny-stable-diffusion-torch" - pretrained_revision = "main" - - gaudi_config = GaudiConfig() - pipeline = GaudiDefaultDDPOStableDiffusionPipeline( - pretrained_model, - pretrained_model_revision=pretrained_revision, - use_lora=True, - gaudi_config=gaudi_config, - use_habana=True, - use_hpu_graphs=False, - ) - - self.trainer = GaudiDDPOTrainer( - self.ddpo_config, - scorer_function, - prompt_function, - pipeline, - gaudi_config=gaudi_config, - use_habana=True, - use_hpu_graphs=False, - ) - - return super().setUp() diff --git a/tests/test_zero_shot_object_detection.py b/tests/test_zero_shot_object_detection.py deleted file mode 100644 index a70f8f9f36..0000000000 --- a/tests/test_zero_shot_object_detection.py +++ /dev/null @@ -1,123 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import time -from unittest import TestCase - -import habana_frameworks.torch as ht -import numpy as np -import requests -import torch -from PIL import Image -from transformers import OwlViTForObjectDetection, OwlViTProcessor - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - -adapt_transformers_to_gaudi() - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 4.2139556878198333 -else: - # Gaudi1 CI baselines - LATENCY_OWLVIT_BF16_GRAPH_BASELINE = 8.460688591003418 - - -class GaudiOWlVITTester(TestCase): - """ - Tests for Zero Shot Object Detection - OWLVIT - """ - - def prepare_model_and_processor(self): - model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to("hpu") - model = model.eval() - processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") - return model, processor - - def prepare_data(self): - texts = "a photo of a cat, a photo of a dog" - image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - return texts, image - - def test_inference_default(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - outputs = model(**inputs) - target_sizes = torch.Tensor([image.size[::-1]]) - results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) - boxes = results[0]["boxes"] - self.assertEqual(len(boxes), 2) - expected_location = np.array([324.9933, 20.4362, 640.6164, 373.2621]) - self.assertLess(np.abs(boxes[0].cpu().detach().numpy() - expected_location).max(), 1) - - def test_inference_bf16(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - 
inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - - with torch.autocast(device_type="hpu", dtype=torch.bfloat16): # Autocast BF16 - outputs = model(**inputs) - target_sizes = torch.Tensor([image.size[::-1]]) - results = processor.post_process_object_detection( - outputs=outputs, target_sizes=target_sizes, threshold=0.1 - ) - boxes = results[0]["boxes"] - expected_location = np.array([324.9933, 20.4362, 640.6164, 373.2621]) - self.assertLess(np.abs(boxes[0].to(torch.float32).cpu().detach().numpy() - expected_location).max(), 2) - - def test_inference_hpu_graphs(self): - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - - model = ht.hpu.wrap_in_hpu_graph(model) # Apply graph - - outputs = model(**inputs) - target_sizes = torch.Tensor([image.size[::-1]]) - results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) - boxes = results[0]["boxes"] - self.assertEqual(len(boxes), 2) - expected_location = np.array([324.9933, 20.4362, 640.6164, 373.2621]) - self.assertLess(np.abs(boxes[0].to(torch.float32).cpu().detach().numpy() - expected_location).max(), 1) - - def test_no_latency_regression_bf16(self): - warmup = 3 - iterations = 10 - - model, processor = self.prepare_model_and_processor() - texts, image = self.prepare_data() - - model = ht.hpu.wrap_in_hpu_graph(model) - - with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - for i in range(warmup): - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - _ = model(**inputs) - torch.hpu.synchronize() - - total_model_time = 0 - for i in range(iterations): - inputs = processor(text=texts, images=image, return_tensors="pt").to("hpu") - model_start_time = time.time() - _ = model(**inputs) - torch.hpu.synchronize() - model_end_time = time.time() - total_model_time = total_model_time + (model_end_time - model_start_time) - - latency = total_model_time * 1000 / iterations # in terms of ms - self.assertLessEqual(latency, 1.05 * LATENCY_OWLVIT_BF16_GRAPH_BASELINE) diff --git a/tests/transformers/tests/__init__.py b/tests/transformers/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/generation/__init__.py b/tests/transformers/tests/generation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/generation/test_beam_constraints.py b/tests/transformers/tests/generation/test_beam_constraints.py deleted file mode 100644 index 5f45e4908f..0000000000 --- a/tests/transformers/tests/generation/test_beam_constraints.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -from transformers import is_torch_available -from transformers.testing_utils import require_torch - - -if is_torch_available(): - import torch - from transformers.generation import DisjunctiveConstraint - - -@require_torch -class ConstraintTest(unittest.TestCase): - def test_input_types(self): - # For consistency across different places the DisjunctiveConstraint is called, - # dc.token_ids is a list of integers. It is also initialized only by integers. - - cset = [[1, 2, 4], [1, 2, 3, 4]] - dc = DisjunctiveConstraint(cset) - self.assertTrue(isinstance(dc.token_ids, list)) - - with self.assertRaises(ValueError): - DisjunctiveConstraint(torch.LongTensor([[1, 2, 4], [1, 2, 3]])) - - with self.assertRaises(ValueError): - DisjunctiveConstraint([torch.LongTensor([1, 2, 4]), torch.LongTensor([1, 2, 3, 4, 5])]) - - def test_check_illegal_input(self): - # We can't have constraints that are complete subsets of another. This leads to a perverse - # interpretation of "constraint fulfillment": does generating [1,2,3] fulfill the constraint? - # It would mean that it generated [1,2] which fulfills it, but it's in the middle of potentially - # fulfilling [1,2,3,4]. If we believe that [1,2,3] does fulfill the constraint, then the algorithm - # will necessarily never reach [1,2,3,4], giving users a false sense of control (better to just not allow it). - cset = [[1, 2], [1, 2, 3, 4]] - - with self.assertRaises(ValueError): - DisjunctiveConstraint(cset) # fails here - - def test_example_progression(self): - cset = [[1, 2, 3], [1, 2, 4]] - - dc = DisjunctiveConstraint(cset) - - stepped, completed, reset = dc.update(1) - desired = stepped is True and completed is False and reset is False - self.assertTrue(desired) - self.assertTrue(not dc.completed) - self.assertTrue(dc.current_seq == [1]) - - stepped, completed, reset = dc.update(2) - desired = stepped is True and completed is False and reset is False - self.assertTrue(desired) - self.assertTrue(not dc.completed) - self.assertTrue(dc.current_seq == [1, 2]) - - stepped, completed, reset = dc.update(3) - desired = stepped is True and completed is True and reset is False - self.assertTrue(desired) - self.assertTrue(dc.completed) # Completed! - self.assertTrue(dc.current_seq == [1, 2, 3]) - - def test_example_progression_unequal_three_mid_and_reset(self): - cset = [[1, 2, 3], [1, 2, 4, 5], [1, 2, 5]] - - dc = DisjunctiveConstraint(cset) - - stepped, completed, reset = dc.update(1) - self.assertTrue(not dc.completed) - self.assertTrue(dc.current_seq == [1]) - - stepped, completed, reset = dc.update(2) - self.assertTrue(not dc.completed) - self.assertTrue(dc.current_seq == [1, 2]) - - stepped, completed, reset = dc.update(4) - self.assertTrue(not dc.completed) - self.assertTrue(dc.current_seq == [1, 2, 4]) - - stepped, completed, reset = dc.update(5) - self.assertTrue(dc.completed) # Completed! - self.assertTrue(dc.current_seq == [1, 2, 4, 5]) - - dc.reset() - - stepped, completed, reset = dc.update(1) - self.assertTrue(not dc.completed) - self.assertTrue(dc.remaining() == 3) - self.assertTrue(dc.current_seq == [1]) - - stepped, completed, reset = dc.update(2) - self.assertTrue(not dc.completed) - self.assertTrue(dc.remaining() == 2) - self.assertTrue(dc.current_seq == [1, 2]) - - stepped, completed, reset = dc.update(5) - self.assertTrue(dc.completed) # Completed! 
- self.assertTrue(dc.remaining() == 0) - self.assertTrue(dc.current_seq == [1, 2, 5]) diff --git a/tests/transformers/tests/generation/test_beam_search.py b/tests/transformers/tests/generation/test_beam_search.py deleted file mode 100644 index a93556e025..0000000000 --- a/tests/transformers/tests/generation/test_beam_search.py +++ /dev/null @@ -1,577 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from transformers import is_torch_available -from transformers.testing_utils import require_torch - -from ..test_modeling_common import floats_tensor, ids_tensor, torch_device - - -assert torch_device == "hpu" -if is_torch_available(): - import torch - from transformers.generation import ( - BeamHypotheses, - BeamSearchScorer, - ConstrainedBeamSearchScorer, - DisjunctiveConstraint, - PhrasalConstraint, - ) - - -class BeamSearchTester: - def __init__( - self, - parent, - batch_size=3, - sequence_length=10, - vocab_size=99, - pad_token_id=0, - max_length=20, - num_beams=4, - length_penalty=2.0, - do_early_stopping=True, - num_beam_hyps_to_keep=2, - ): - self.parent = parent - self.batch_size = batch_size - self.sequence_length = sequence_length - self.vocab_size = vocab_size - self.pad_token_id = pad_token_id - self.max_length = max_length - self.num_beams = num_beams - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - - # cannot be randomly generated - self.eos_token_id = vocab_size + 1 - - def prepare_beam_scorer(self, **kwargs): - return BeamSearchScorer( - batch_size=kwargs.get("batch_size", self.batch_size), - num_beams=kwargs.get("num_beams", self.num_beams), - device=torch_device, - length_penalty=kwargs.get("length_penalty", self.length_penalty), - do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping), - num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep), - ) - - def prepare_inputs(self): - input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size) - next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device) - next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device) - next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True) - return (input_ids, next_tokens, next_indices, next_scores) - - def check_beam_hypotheses(self, input_ids, *args): - # check that correct number of beam hypotheses is set in beam scorer - beam_scorer = self.prepare_beam_scorer(do_early_stopping=True) - beam_hyp = beam_scorer._beam_hyps[0] - - self.parent.assertEqual(len(beam_scorer._beam_hyps), self.batch_size) - - # check correct type - self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses)) - - # check that num_beams is correctly set - self.parent.assertEqual(beam_hyp.num_beams, self.num_beams) - - # check for early 
stopping deactivated - for beam_idx in range(self.num_beams): - beam_hyp.add(input_ids[beam_idx], -10.0) - - # if early stopping True -> score does not matter - self.parent.assertTrue(beam_hyp.is_done(-10.0, 5)) - - # re-init - beam_scorer = self.prepare_beam_scorer(do_early_stopping=False) - beam_hyp = beam_scorer._beam_hyps[0] - - # add `num_beams + 1` beams to change `worst_score` - for beam_idx in range(self.num_beams + 1): - beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx)) - - # -10.0 is removed => -9.0 is worst score - self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty)) - - # -5.0 is better than worst score => should not be finished - self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length)) - - # -20.0 is worse than worst score => should be finished - self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length)) - - def check_beam_scorer_update(self, input_ids, next_tokens, next_indices, next_scores): - # check too many eos tokens - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[0, :] = self.eos_token_id - - with self.parent.assertRaises(ValueError): - beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id) - - # check all batches are done - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, : self.num_beams] = self.eos_token_id - beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device) - beam_indices = tuple(tuple(b) for b in beam_indices) - beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices - ) - # beam scorer should be done - self.parent.assertTrue(beam_scorer.is_done) - - # check - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, 1] = self.eos_token_id - beam_outputs = beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - - def cut_expected_tensor(tensor): - return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten() - - # check all outputs - # cut out id of eos token and take best `num_beams` outputs - expected_output_tokens = cut_expected_tensor(tokens) - expected_output_scores = cut_expected_tensor(next_scores) - - # add num_beams * batch_idx - offset = torch.div( - torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor" - ) - expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams - - self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) - self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) - self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3)) - - # make sure ids of eos token are correctly saved in beam_hyps of beam scorer - expected_beam_indices = list(range(10)) - for batch_idx in range(self.batch_size): - correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1] - self.parent.assertListEqual( - input_ids[correct_idx].tolist(), beam_scorer._beam_hyps[batch_idx].beams[0][1].tolist() - ) - self.parent.assertListEqual( - expected_beam_indices + [correct_idx], - 
torch.tensor(beam_scorer._beam_hyps[batch_idx].beams[0][2]).tolist(), - ) - - def check_beam_scores_finalize(self, input_ids, next_tokens, next_indices, next_scores): - # max_length should be only one more than current input_ids to check that eos is correctly appended - max_length = self.sequence_length + 1 - beam_scorer = self.prepare_beam_scorer(num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False) - - # update beams and append to input_ids - tokens = next_tokens.clone() - # first batch, first output has to finish with eos token id since scores are correctly sorted - tokens[0, 0] = self.eos_token_id - # make sure corresponding score is as good as possible to surely be picked first - next_scores[0, 0] = 0.0 - beam_outputs = beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - - input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1) - - # finalize - beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device) - beam_indices = tuple(tuple(b) for b in beam_indices) - sequence_output = beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - beam_indices=beam_indices, - ) - - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length` - self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size]) - - # check sequence_scores - self.parent.assertFalse((sequence_scores > 0).any().item()) - - # first batch has to finish with eos_token - self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id) - - # other batches cannot finish with eos token - self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id) - self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id) - - # now test that if `num_beam_hyps_to_keep` is 3 => all beams are returned - beam_scorer.num_beam_hyps_to_keep = self.num_beams - sequence_output = beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - beam_indices=beam_indices, - ) - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size]) - - -class ConstrainedBeamSearchTester: - def __init__( - self, - parent, - constraints=None, - batch_size=3, - sequence_length=10, - vocab_size=99, - pad_token_id=0, - max_length=20, - num_beams=4, - length_penalty=2.0, - do_early_stopping=True, - num_beam_hyps_to_keep=2, - ): - self.parent = parent - self.batch_size = batch_size - self.sequence_length = sequence_length - self.vocab_size = vocab_size - self.pad_token_id = pad_token_id - self.max_length = max_length - self.num_beams = num_beams - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = 
num_beam_hyps_to_keep - - if constraints is None: - force_tokens = torch.randint(10, 50, (1, 2))[0].tolist() - disjunctive_tokens = torch.randint(10, 50, (2, 2)).tolist() - - constraints = [PhrasalConstraint(force_tokens), DisjunctiveConstraint(disjunctive_tokens)] - self.constraints = constraints - # cannot be randomly generated - self.eos_token_id = vocab_size + 1 - - def prepare_constrained_beam_scorer(self, **kwargs): - return ConstrainedBeamSearchScorer( - constraints=kwargs.get("constraints", self.constraints), - batch_size=kwargs.get("batch_size", self.batch_size), - num_beams=kwargs.get("num_beams", self.num_beams), - device=torch_device, - length_penalty=kwargs.get("length_penalty", self.length_penalty), - do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping), - num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep), - ) - - def prepare_inputs(self): - input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size) - next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device) - next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device) - next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True) - scores_for_all_vocab, _ = ( - -floats_tensor((self.batch_size * self.num_beams, self.vocab_size)).to(torch_device) - ).sort(descending=True) - return (input_ids, next_tokens, next_indices, next_scores, scores_for_all_vocab) - - def check_beam_hypotheses(self, input_ids, *args): - # check that correct number of beam hypotheses is set in beam scorer - constrained_beam_scorer = self.prepare_constrained_beam_scorer(do_early_stopping=True) - beam_hyp = constrained_beam_scorer._beam_hyps[0] - - self.parent.assertEqual(len(constrained_beam_scorer._beam_hyps), self.batch_size) - - # check correct type - self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses)) - - # check that num_beams is correctly set - self.parent.assertEqual(beam_hyp.num_beams, self.num_beams) - - # check for early stopping deactivated - for beam_idx in range(self.num_beams): - beam_hyp.add(input_ids[beam_idx], -10.0) - - # if early stopping True -> score does not matter - self.parent.assertTrue(beam_hyp.is_done(-10.0, 5)) - - # re-init - constrained_beam_scorer = self.prepare_constrained_beam_scorer(do_early_stopping=False) - beam_hyp = constrained_beam_scorer._beam_hyps[0] - - # add `num_beams + 1` beams to change `worst_score` - for beam_idx in range(self.num_beams + 1): - beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx)) - - # -10.0 is removed => -9.0 is worst score - self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty)) - - # -5.0 is better than worst score => should not be finished - self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length)) - - # -20.0 is worse than worst score => should be finished - self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length)) - - def check_constrained_beam_scorer_update( - self, input_ids, next_tokens, next_indices, next_scores, scores_for_all_vocab - ): - # check too many eos tokens - constrained_beam_scorer = self.prepare_constrained_beam_scorer() - stacked_token_ids = [] - for constraint in self.constraints: - token_ids = constraint.token_ids - token_ids = token_ids[0] if isinstance(token_ids[0], list) else token_ids - stacked_token_ids = stacked_token_ids + token_ids - - 
fulfilling_sequence = torch.LongTensor(stacked_token_ids).to(torch_device) - fulfill_len = fulfilling_sequence.size(0) - input_ids[:, :fulfill_len] = fulfilling_sequence - - tokens = next_tokens.clone() - tokens[0, :] = self.eos_token_id - - with self.parent.assertRaises(ValueError): - constrained_beam_scorer.process( - input_ids, next_scores, tokens, next_indices, scores_for_all_vocab, eos_token_id=self.eos_token_id - ) - - # check all batches are done - constrained_beam_scorer = self.prepare_constrained_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, : self.num_beams] = self.eos_token_id - constrained_beam_scorer.process( - input_ids, next_scores, tokens, next_indices, scores_for_all_vocab, eos_token_id=self.eos_token_id - ) - # beam scorer should be done - self.parent.assertTrue(constrained_beam_scorer.is_done) - - # check - constrained_beam_scorer = self.prepare_constrained_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, 1] = self.eos_token_id - beam_outputs = constrained_beam_scorer.process( - input_ids, next_scores, tokens, next_indices, scores_for_all_vocab, eos_token_id=self.eos_token_id - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - - def cut_expected_tensor(tensor): - return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten() - - # check all outputs - # cut out id of eos token and take best `num_beams` outputs - expected_output_tokens = cut_expected_tensor(tokens) - expected_output_scores = cut_expected_tensor(next_scores) - - # add num_beams * batch_idx - offset = torch.div( - torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor" - ) - expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams - - self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) - self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) - self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3)) - - # make sure ids of eos token are correctly saved in beam_hyps of beam scorer - for batch_idx in range(self.batch_size): - correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1] - self.parent.assertListEqual( - input_ids[correct_idx].tolist(), constrained_beam_scorer._beam_hyps[batch_idx].beams[0][1].tolist() - ) - - def check_constrained_beam_scorer_finalize( - self, input_ids, next_tokens, next_indices, next_scores, scores_for_all_vocab - ): - # max_length should be only one more than current input_ids to check that eos is correctly appended - max_length = self.sequence_length + 1 - - # for testing finalize, we do want to have fulfilled constraints - stacked_token_ids = [] - for constraint in self.constraints: - token_ids = constraint.token_ids - token_ids = token_ids[0] if isinstance(token_ids[0], list) else token_ids - stacked_token_ids = stacked_token_ids + token_ids - - fulfilling_sequence = torch.LongTensor(stacked_token_ids).to(torch_device) - - fulfill_len = fulfilling_sequence.size(0) - input_ids[:, :fulfill_len] = fulfilling_sequence - - constrained_beam_scorer = self.prepare_constrained_beam_scorer( - num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False - ) - - constraints = constrained_beam_scorer.constraints - # update beams and append to input_ids - tokens = next_tokens.clone() - # first batch, first output has to finish with eos token id 
since scores are correctly sorted - tokens[0, 0] = self.eos_token_id - # make sure corresponding score is as good as possible to surely be picked first - next_scores[0, 0] = 0.0 - - beam_outputs = constrained_beam_scorer.process( - input_ids, next_scores, tokens, next_indices, scores_for_all_vocab, eos_token_id=self.eos_token_id - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1) - - # finalize - sequence_output = constrained_beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - ) - - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length` - self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size]) - - # check sequence_scores - self.parent.assertFalse((sequence_scores > 0).any().item()) - - # first batch has to finish with eos_token - self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id) - - # other batches cannot finish with eos token - self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id) - self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id) - - # test that the constraint is indeed fulfilled - for output, constraint in [(s, c) for s in sequences for c in constraints]: - forced_token_ids = constraint.token_ids - if isinstance(forced_token_ids[0], list): - # disjunctive case - flag = False - for token_ids in forced_token_ids: - if self._check_sequence_inside_sequence(output, token_ids): - flag = True - break - self.parent.assertEqual(flag, True) - else: - self.parent.assertEqual(self._check_sequence_inside_sequence(output, forced_token_ids), True) - - # now test that if `num_beam_hyps_to_keep` is 3 => all beams are returned - - # constrained_beam_scorer.num_beam_hyps_to_keep = self.num_beams - constrained_beam_scorer = self.prepare_constrained_beam_scorer( - num_beam_hyps_to_keep=self.num_beams, length_penalty=1.0, do_early_stopping=False - ) - - sequence_output = constrained_beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - ) - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size]) - - def _check_sequence_inside_sequence(self, tensor_1, tensor_2): - # check if tensor_1 inside tensor_2 or tensor_2 inside tensor_1. - # set to same device. we don't care what device. 
- - if not isinstance(tensor_1, list): - tensor_1 = tensor_1.cpu().tolist() - if not isinstance(tensor_2, list): - tensor_2 = tensor_2.cpu().tolist() - - in_order = len(tensor_1) <= len(tensor_2) - longer = tensor_2 if in_order else tensor_1 - shorter = tensor_1 if in_order else tensor_2 - - flag = False - chunk_size = len(shorter) - for chunk_idx in range(len(longer) - chunk_size + 1): - subseq = longer[chunk_idx : chunk_idx + chunk_size] - if subseq == shorter: - flag = True - break - - return flag - - -@require_torch -class BeamSearchTest(unittest.TestCase): - def setUp(self): - self.beam_search_tester = BeamSearchTester(self) - - def test_beam_hypotheses(self): - inputs = self.beam_search_tester.prepare_inputs() - self.beam_search_tester.check_beam_hypotheses(*inputs) - - def test_beam_scorer_update(self): - inputs = self.beam_search_tester.prepare_inputs() - self.beam_search_tester.check_beam_scorer_update(*inputs) - - def test_beam_scorer_finalize(self): - inputs = self.beam_search_tester.prepare_inputs() - self.beam_search_tester.check_beam_scores_finalize(*inputs) - - -@require_torch -class ConstrainedBeamSearchTest(unittest.TestCase): - def setUp(self): - self.constrained_beam_search_tester = ConstrainedBeamSearchTester(self) - - def test_constrained_beam_hypotheses(self): - inputs = self.constrained_beam_search_tester.prepare_inputs() - self.constrained_beam_search_tester.check_beam_hypotheses(*inputs) - - def test_constrained_beam_scorer_update(self): - inputs = self.constrained_beam_search_tester.prepare_inputs() - self.constrained_beam_search_tester.check_constrained_beam_scorer_update(*inputs) - - def test_constrained_beam_scorer_finalize(self): - inputs = self.constrained_beam_search_tester.prepare_inputs() - self.constrained_beam_search_tester.check_constrained_beam_scorer_finalize(*inputs) diff --git a/tests/transformers/tests/generation/test_configuration_utils.py b/tests/transformers/tests/generation/test_configuration_utils.py deleted file mode 100644 index 65d9e1082b..0000000000 --- a/tests/transformers/tests/generation/test_configuration_utils.py +++ /dev/null @@ -1,223 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a clone of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
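# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal, standalone version of the containment check that
# `_check_sequence_inside_sequence` performs in the constrained beam search
# tests above: it verifies that one token-id list occurs as a contiguous
# subsequence of the other, in either direction. Function and variable names
# here are assumptions for illustration only.

def contains_subsequence(ids_a, ids_b):
    """Return True if the shorter list occurs contiguously inside the longer one."""
    shorter, longer = sorted([list(ids_a), list(ids_b)], key=len)
    n = len(shorter)
    return any(longer[i : i + n] == shorter for i in range(len(longer) - n + 1))

# Example: the forced phrase [5, 6] is inside the generated ids [1, 5, 6, 2],
# but the reversed phrase [6, 5] is not.
assert contains_subsequence([5, 6], [1, 5, 6, 2])
assert not contains_subsequence([6, 5], [1, 5, 6, 2])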
- -import copy -import os -import tempfile -import unittest -import warnings - -from huggingface_hub import HfFolder, delete_repo -from parameterized import parameterized -from requests.exceptions import HTTPError -from transformers import AutoConfig, GenerationConfig -from transformers.testing_utils import TOKEN, USER, is_staging_test - - -class GenerationConfigTest(unittest.TestCase): - @parameterized.expand([(None,), ("foo.json",)]) - def test_save_load_config(self, config_name): - config = GenerationConfig( - do_sample=True, - temperature=0.7, - length_penalty=1.0, - bad_words_ids=[[1, 2, 3], [4, 5]], - ) - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained(tmp_dir, config_name=config_name) - loaded_config = GenerationConfig.from_pretrained(tmp_dir, config_name=config_name) - - # Checks parameters that were specified - self.assertEqual(loaded_config.do_sample, True) - self.assertEqual(loaded_config.temperature, 0.7) - self.assertEqual(loaded_config.length_penalty, 1.0) - self.assertEqual(loaded_config.bad_words_ids, [[1, 2, 3], [4, 5]]) - - # Checks parameters that were not specified (defaults) - self.assertEqual(loaded_config.top_k, 50) - self.assertEqual(loaded_config.max_length, 20) - self.assertEqual(loaded_config.max_time, None) - - def test_from_model_config(self): - model_config = AutoConfig.from_pretrained("gpt2") - generation_config_from_model = GenerationConfig.from_model_config(model_config) - default_generation_config = GenerationConfig() - - # The generation config has loaded a few non-default parameters from the model config - self.assertNotEqual(generation_config_from_model, default_generation_config) - - # One of those parameters is eos_token_id -- check if it matches - self.assertNotEqual(generation_config_from_model.eos_token_id, default_generation_config.eos_token_id) - self.assertEqual(generation_config_from_model.eos_token_id, model_config.eos_token_id) - - def test_update(self): - generation_config = GenerationConfig() - update_kwargs = { - "max_new_tokens": 1024, - "foo": "bar", - } - update_kwargs_copy = copy.deepcopy(update_kwargs) - unused_kwargs = generation_config.update(**update_kwargs) - - # update_kwargs was not modified (no side effects) - self.assertEqual(update_kwargs, update_kwargs_copy) - - # update_kwargs was used to update the config on valid attributes - self.assertEqual(generation_config.max_new_tokens, 1024) - - # `.update()` returns a dictionary of unused kwargs - self.assertEqual(unused_kwargs, {"foo": "bar"}) - - def test_initialize_new_kwargs(self): - generation_config = GenerationConfig() - generation_config.foo = "bar" - - with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir: - generation_config.save_pretrained(tmp_dir) - - new_config = GenerationConfig.from_pretrained(tmp_dir) - # update_kwargs was used to update the config on valid attributes - self.assertEqual(new_config.foo, "bar") - - generation_config = GenerationConfig.from_model_config(new_config) - assert not hasattr(generation_config, "foo") # no new kwargs should be initialized if from config - - def test_kwarg_init(self): - """Tests that we can overwrite attributes at `from_pretrained` time.""" - default_config = GenerationConfig() - self.assertEqual(default_config.temperature, 1.0) - self.assertEqual(default_config.do_sample, False) - self.assertEqual(default_config.num_beams, 1) - - config = GenerationConfig( - do_sample=True, - temperature=0.7, - length_penalty=1.0, - bad_words_ids=[[1, 2, 3], [4, 5]], - ) - 
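# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of the GenerationConfig round-trip exercised by
# test_save_load_config and test_update above: save_pretrained/from_pretrained
# preserve explicitly set fields, unset fields keep their defaults, and
# .update() hands back any kwargs it could not consume. Assumes `transformers`
# is installed; the values are arbitrary.

import tempfile
from transformers import GenerationConfig

config = GenerationConfig(do_sample=True, temperature=0.7)
with tempfile.TemporaryDirectory() as tmp_dir:
    config.save_pretrained(tmp_dir)
    reloaded = GenerationConfig.from_pretrained(tmp_dir)

assert reloaded.temperature == 0.7      # explicitly set value survives the round-trip
assert reloaded.top_k == 50             # unset fields keep their defaults
unused = reloaded.update(max_new_tokens=32, foo="bar")
assert unused == {"foo": "bar"}         # unknown kwargs are returned, not silently dropped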
self.assertEqual(config.temperature, 0.7) - self.assertEqual(config.do_sample, True) - self.assertEqual(config.num_beams, 1) - - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained(tmp_dir) - loaded_config = GenerationConfig.from_pretrained(tmp_dir, temperature=1.0) - - self.assertEqual(loaded_config.temperature, 1.0) - self.assertEqual(loaded_config.do_sample, True) - self.assertEqual(loaded_config.num_beams, 1) # default value - - def test_refuse_to_save(self): - """Tests that we refuse to save a generation config that fails validation.""" - - # setting the temperature alone is invalid, as we also need to set do_sample to True -> throws a warning that - # is caught, doesn't save, and raises an exception - config = GenerationConfig() - config.temperature = 0.5 - with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(ValueError) as exc: - config.save_pretrained(tmp_dir) - self.assertTrue("Fix these issues to save the configuration." in str(exc.exception)) - self.assertTrue(len(os.listdir(tmp_dir)) == 0) - - # greedy decoding throws an exception if we try to return multiple sequences -> throws an exception that is - # caught, doesn't save, and raises a warning - config = GenerationConfig() - config.num_return_sequences = 2 - with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(ValueError) as exc: - config.save_pretrained(tmp_dir) - self.assertTrue("Fix these issues to save the configuration." in str(exc.exception)) - self.assertTrue(len(os.listdir(tmp_dir)) == 0) - - # final check: no warnings thrown if it is correct, and file is saved - config = GenerationConfig() - with tempfile.TemporaryDirectory() as tmp_dir: - with warnings.catch_warnings(record=True) as captured_warnings: - config.save_pretrained(tmp_dir) - self.assertEqual(len(captured_warnings), 0) - self.assertTrue(len(os.listdir(tmp_dir)) == 1) - - -@is_staging_test -class ConfigPushToHubTester(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._token = TOKEN - HfFolder.save_token(TOKEN) - - @classmethod - def tearDownClass(cls): - try: - delete_repo(token=cls._token, repo_id="test-generation-config") - except HTTPError: - pass - - try: - delete_repo(token=cls._token, repo_id="valid_org/test-generation-config-org") - except HTTPError: - pass - - def test_push_to_hub(self): - config = GenerationConfig( - do_sample=True, - temperature=0.7, - length_penalty=1.0, - ) - config.push_to_hub("test-generation-config", token=self._token) - - new_config = GenerationConfig.from_pretrained(f"{USER}/test-generation-config") - for k, v in config.to_dict().items(): - if k != "transformers_version": - self.assertEqual(v, getattr(new_config, k)) - - # Reset repo - delete_repo(token=self._token, repo_id="test-generation-config") - - # Push to hub via save_pretrained - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained(tmp_dir, repo_id="test-generation-config", push_to_hub=True, token=self._token) - - new_config = GenerationConfig.from_pretrained(f"{USER}/test-generation-config") - for k, v in config.to_dict().items(): - if k != "transformers_version": - self.assertEqual(v, getattr(new_config, k)) - - def test_push_to_hub_in_organization(self): - config = GenerationConfig( - do_sample=True, - temperature=0.7, - length_penalty=1.0, - ) - config.push_to_hub("valid_org/test-generation-config-org", token=self._token) - - new_config = GenerationConfig.from_pretrained("valid_org/test-generation-config-org") - for k, v in config.to_dict().items(): - if k != 
"transformers_version": - self.assertEqual(v, getattr(new_config, k)) - - # Reset repo - delete_repo(token=self._token, repo_id="valid_org/test-generation-config-org") - - # Push to hub via save_pretrained - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained( - tmp_dir, repo_id="valid_org/test-generation-config-org", push_to_hub=True, token=self._token - ) - - new_config = GenerationConfig.from_pretrained("valid_org/test-generation-config-org") - for k, v in config.to_dict().items(): - if k != "transformers_version": - self.assertEqual(v, getattr(new_config, k)) diff --git a/tests/transformers/tests/generation/test_framework_agnostic.py b/tests/transformers/tests/generation/test_framework_agnostic.py deleted file mode 100644 index 7fcc4de752..0000000000 --- a/tests/transformers/tests/generation/test_framework_agnostic.py +++ /dev/null @@ -1,687 +0,0 @@ -""" -Framework agnostic tests for generate()-related methods. -""" - -import numpy as np -from transformers import AutoTokenizer -from transformers.testing_utils import slow, torch_device - - -class GenerationIntegrationTestsMixin: - # To be populated by the child classes - framework_dependent_parameters = { - "AutoModelForCausalLM": None, - "AutoModelForSpeechSeq2Seq": None, - "AutoModelForSeq2SeqLM": None, - "AutoModelForVision2Seq": None, - "LogitsProcessorList": None, - "MinLengthLogitsProcessor": None, - "create_tensor_fn": None, - "floats_tensor": None, - "return_tensors": None, - "set_seed": None, - } - - def test_validate_generation_inputs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-t5") - - encoder_input_str = "Hello world" - input_ids = tokenizer(encoder_input_str, return_tensors=return_tensors).input_ids - - # typos are quickly detected (the correct argument is `do_sample`) - with self.assertRaisesRegex(ValueError, "do_samples"): - model.generate(input_ids, do_samples=True) - - # arbitrary arguments that will not be used anywhere are also not accepted - with self.assertRaisesRegex(ValueError, "foo"): - fake_model_kwargs = {"foo": "bar"} - model.generate(input_ids, **fake_model_kwargs) - - # however, valid model_kwargs are accepted - valid_model_kwargs = {"attention_mask": create_tensor_fn(np.zeros_like(input_ids))} - model.generate(input_ids, **valid_model_kwargs) - - def test_custom_logits_processor(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - logits_processor_list_cls = self.framework_dependent_parameters["LogitsProcessorList"] - min_length_logits_processor_cls = self.framework_dependent_parameters["MinLengthLogitsProcessor"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", min_length=1) - input_ids = bart_tokenizer(article, return_tensors=return_tensors).input_ids - - logits_processor = logits_processor_list_cls() - logits_processor.append(min_length_logits_processor_cls(min_length=10, eos_token_id=0)) - # it should not be allowed to both define `min_length` via 
config and `logits_processor` list - with self.assertRaises(ValueError): - bart_model.generate(input_ids, logits_processor=logits_processor) - - bart_model.config.min_length = None - bart_model.generate(input_ids, logits_processor=logits_processor) - - def test_max_new_tokens_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart") - input_ids = bart_tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - bart_model = bart_model.to(torch_device) - input_ids = input_ids.to(torch_device) - - self.assertEqual(list(input_ids.shape), [1, 29]) - - max_new_tokens = 3 - bart_model.config.max_length = 20 - bart_model.config.eos_token_id = None - - # Encoder decoder call - outputs = bart_model.generate(input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 4]) - - # Decoder only call - outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 29 (input length) + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 33]) - - # Encoder decoder call > 20 - outputs = bart_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS + 20 + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_max_new_tokens_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake.""" - gpt2_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = gpt2_tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - gpt2_model = gpt2_model.to(torch_device) - input_ids = input_ids.to(torch_device) - - self.assertEqual(list(input_ids.shape), [1, 9]) - - max_new_tokens = 3 - gpt2_model.config.max_length = 20 - - # call < 20 - outputs = gpt2_model.generate(input_ids, max_new_tokens=max_new_tokens) - - # 9 input_ids + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 12]) - - # call > 20 - outputs = gpt2_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS token + 23 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_encoder_decoder_generate_with_inputs_embeds(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - - inputs_embeds = model.get_input_embeddings()(input_ids) - - output_sequences = model.generate(inputs_embeds=inputs_embeds) - - # make sure model generated correctly until `max_length` - self.assertEqual(output_sequences.shape, (1, 5)) - - def 
test_transition_scores_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilgpt2") - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-57.8844, -60.45698, -70.16364, -65.50791, -66.35648], - [-54.417572, -60.216614, -62.661243, -58.621933, -58.298683], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_greedy_search_normalized(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilgpt2") - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-2.538938, -2.2694316, -2.1580915, -1.572299, -2.6719835], - [-1.8826028, -2.2461371, -1.7556462, -2.9644494, -1.7996008], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = 
model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder_with_eos(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake", - "Michael Phelps", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-gpt2", - max_length=10, - num_beams=4, - num_return_sequences=2, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_sample_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - do_sample=True, 
- max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - @slow - def test_transition_scores_early_stopping(self): - # This is an aggressive test that makes sure that `beam_search's` - # transition scores are computed correctly for varying `num_return_sequences`, `num_beams` and `batch_size > 1` - # 2 x input_ids for "question: How are you? \n context: I had a long day, " - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - input_ids = create_tensor_fn(2 * [[822, 10, 571, 33, 25, 58, 2625, 10, 27, 141, 3, 9, 307, 239, 6, 1]]) - model = model_cls.from_pretrained("t5-small") - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - outputs = model.generate( - input_ids, - max_length=10, - return_dict_in_generate=True, - output_scores=True, - forced_eos_token_id=model.config.eos_token_id, - num_beams=4, - do_sample=False, - num_return_sequences=3, - length_penalty=0.0, - ) - - transition_scores = model.compute_transition_scores( - sequences=outputs.sequences, scores=outputs.scores, beam_indices=outputs.beam_indices - ) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores)) - - def test_encoder_decoder_generate_attention_mask(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - articles = ["Timberlake", "Jessica Biel, welcome to parenthood among other things"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - # need extreme generation values here to force this test - # to fail when `attention_mask` is not correctly treated in generate - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", max_length=50, num_beams=5, num_return_sequences=5 - ) - model.config.eos_token_id = None - input_ids = tokenizer(articles[0], return_tensors=return_tensors).input_ids - input_ids_batched = tokenizer(articles, padding=True, return_tensors=return_tensors).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - input_ids_batched = input_ids_batched.to(torch_device) - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate(input_ids=input_ids, return_dict_in_generate=True, output_scores=True) - - batched_out = output_sequences_batched.sequences_scores - out = output_sequences.sequences_scores - if is_pt: - batched_out = 
batched_out.cpu().numpy() - out = out.cpu().numpy() - - diff = np.abs(np.sum(batched_out[:5]) - np.sum(out)) - self.assertTrue(diff < 1e-4) - - def test_generate_input_ids_as_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=15) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 15)) - - def test_generate_input_ids_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - if is_pt: - model = model.to(torch_device) - input_ids = input_ids.to(torch_device) - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 5)) - - def test_generate_inputs_and_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids, input_ids=input_ids) - - def test_generate_too_many_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids=input_ids, inputs_embeds=input_ids) - - def test_generate_input_features_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - 
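# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of the property the transition-score tests above assert:
# with length_penalty=0.0, the per-step scores returned by
# compute_transition_scores sum to the reported beam sequence scores. The tiny
# checkpoint name and generation settings are taken from the tests above; this
# is a sketch under those assumptions, not the test itself.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("hf-internal-testing/tiny-random-bart")

inputs = tokenizer("Justin Timberlake and Jessica Biel, welcome to parenthood.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=10,
    num_beams=4,
    num_return_sequences=2,
    length_penalty=0.0,
    return_dict_in_generate=True,
    output_scores=True,
)
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, outputs.beam_indices
)
# Per-beam sum of step scores matches the final sequence scores.
assert torch.allclose(transition_scores.sum(dim=-1), outputs.sequences_scores, atol=1e-3)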
floats_tensor = self.framework_dependent_parameters["floats_tensor"] - is_pt = not model_cls.__name__.startswith("TF") - - input_features = floats_tensor((3, 80, 60)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - if is_pt: - input_features.to(torch_device) - model = model.to(torch_device) - - output_sequences_kwargs = model.generate(input_features=input_features, max_length=5) - output_sequences = model.generate(input_features, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (3, 5)) - - def test_generate_pixel_values_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - is_pt = not model_cls.__name__.startswith("TF") - - pixel_values = floats_tensor((2, 3, 30, 30)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - model.config.decoder.eos_token_id = None - if is_pt: - pixel_values = pixel_values.to(torch_device) - model = model.to(torch_device) - - output_sequences_kwargs = model.generate(pixel_values=pixel_values, max_length=5) - output_sequences = model.generate(pixel_values, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (2, 5)) - - def test_generate_encoder_outputs_attention_mask(self): - model_cls = self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - input_features = floats_tensor((3, 80, 60)) - attention_mask = create_tensor_fn(np.ones(input_features.shape)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - if is_pt: - input_features = input_features.to(torch_device) - attention_mask = attention_mask.to(torch_device) - model = model.to(torch_device) - - encoder = model.get_encoder() - encoder_outputs = encoder(input_features) - - output_sequences_no_mask = model.generate(encoder_outputs=encoder_outputs) - output_sequences_with_mask = model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask) - if is_pt: - output_sequences_no_mask = output_sequences_no_mask.cpu().numpy() - output_sequences_with_mask = output_sequences_with_mask.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences_no_mask, output_sequences_with_mask)) - - def test_eos_token_id_int_and_list_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - 
model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_contrastive_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - "penalty_alpha": 0.6, - "top_k": 4, - } - expectation = 17 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 225 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [225, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_beam_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - is_pt = not model_cls.__name__.startswith("TF") - - generation_kwargs = { - "do_sample": False, - "num_beams": 3, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - if is_pt: - model = model.to(torch_device) - tokens = tokens.to(torch_device) - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - def test_generate_vision2text_conditioning(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - is_pt = not model_cls.__name__.startswith("TF") - - pixel_values = floats_tensor((2, 3, 30, 30)) - conditioning_input = create_tensor_fn([[10], [10]]) # this should be the 2nd output token, after the BOS token - 
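# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of the behaviour the eos_token_id tests above rely on:
# generate() stops on a single eos id or on any id from a list. The tiny
# checkpoint and the id 873 come from the tests above; the exact stopping
# length depends on those model weights, so only the single-id vs. list
# agreement under greedy (deterministic) decoding is checked here.

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
tokens = tokenizer("Hello, my dog is cute and", return_tensors="pt")

out_int = model.generate(**tokens, do_sample=False, num_beams=1, eos_token_id=873)
out_list = model.generate(**tokens, do_sample=False, num_beams=1, eos_token_id=[873, 198])
# Both runs stop at the same length when 873 is reached before any other id in the list.
assert out_int.shape[-1] == out_list.shape[-1]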
model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - if is_pt: - pixel_values = pixel_values.to(torch_device) - model = model.to(torch_device) - conditioning_input = conditioning_input.to(torch_device) - - # we can condition on decoder_input_ids (expected decoder input) and input_ids (which we pipe internally as - # decoder_input_ids, if the encoder is not a model with text input) - output_sequences_decoder_input_ids = model.generate( - pixel_values, max_length=5, decoder_input_ids=conditioning_input - ) - output_sequences_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning_input) - if is_pt: - output_sequences_decoder_input_ids = output_sequences_decoder_input_ids.cpu().numpy() - output_sequences_input_ids = output_sequences_input_ids.cpu().numpy() - conditioning_input = conditioning_input.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids)) - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input)) diff --git a/tests/transformers/tests/generation/test_logits_process.py b/tests/transformers/tests/generation/test_logits_process.py deleted file mode 100644 index d40a94d38f..0000000000 --- a/tests/transformers/tests/generation/test_logits_process.py +++ /dev/null @@ -1,931 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a clone of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
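# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of the conditioning equivalence checked by
# test_generate_vision2text_conditioning above: for a vision encoder-decoder
# model, token ids passed as `input_ids` are routed to the decoder, so they
# behave like `decoder_input_ids` and show up right after the BOS token.
# The checkpoint and shapes are taken from that test.

import torch
from transformers import AutoModelForVision2Seq

model = AutoModelForVision2Seq.from_pretrained(
    "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2"
)
model.config.decoder.eos_token_id = None  # mirror the test: avoid early stopping
pixel_values = torch.rand(2, 3, 30, 30)
conditioning = torch.tensor([[10], [10]])  # forced second token, right after BOS

out_decoder_ids = model.generate(pixel_values, max_length=5, decoder_input_ids=conditioning)
out_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning)

assert torch.equal(out_decoder_ids, out_input_ids)
assert torch.equal(out_decoder_ids[:, 1:2], conditioning)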
- -import unittest -from typing import List, Union - -from parameterized import parameterized -from transformers import is_torch_available -from transformers.testing_utils import require_torch - -from ..test_modeling_common import ids_tensor, torch_device - - -if is_torch_available(): - import torch - from torch import nn - from transformers.generation import ( - EncoderNoRepeatNGramLogitsProcessor, - EncoderRepetitionPenaltyLogitsProcessor, - EpsilonLogitsWarper, - EtaLogitsWarper, - ExponentialDecayLengthPenalty, - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, - InfNanRemoveLogitsProcessor, - LogitNormalization, - LogitsProcessorList, - MinLengthLogitsProcessor, - MinNewTokensLengthLogitsProcessor, - NoBadWordsLogitsProcessor, - NoRepeatNGramLogitsProcessor, - PrefixConstrainedLogitsProcessor, - RepetitionPenaltyLogitsProcessor, - SequenceBiasLogitsProcessor, - TemperatureLogitsWarper, - TopKLogitsWarper, - TopPLogitsWarper, - TypicalLogitsWarper, - UnbatchedClassifierFreeGuidanceLogitsProcessor, - ) - from transformers.generation.logits_process import BarkEosPrioritizerLogitsProcessor - - -@require_torch -class LogitsProcessorTest(unittest.TestCase): - def _get_uniform_logits(self, batch_size: int, length: int): - scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length - return scores - - def test_logits_processor_expected_device(self): - EXPECTED_DEVICE_TYPE = "hpu" - batch_size = 4 - sequence_length = 10 - vocab_size = 15 - eos_token_id = 0 - min_eos_p = 0.1 ## some small float - - # dummy input_ids and scores - input_ids = ids_tensor((batch_size, sequence_length), vocab_size) - scores = self._get_uniform_logits(batch_size, vocab_size) - - processors = [ - MinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id), - TemperatureLogitsWarper(temperature=0.5), - RepetitionPenaltyLogitsProcessor(penalty=2.0), - TopKLogitsWarper(3), - TopPLogitsWarper(0.8), - NoRepeatNGramLogitsProcessor(2), - NoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id), - BarkEosPrioritizerLogitsProcessor(eos_token_id=eos_token_id, min_eos_p=min_eos_p), - ] - - scores = self._get_uniform_logits(batch_size, vocab_size) - self.assertTrue(scores.device.type == EXPECTED_DEVICE_TYPE) - for processor in processors: - scores = processor(input_ids, scores) - self.assertTrue(scores.device.type == EXPECTED_DEVICE_TYPE) - - def test_min_length_dist_processor(self): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - - min_dist_processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - - # check that min length is applied at length 5 - input_ids = ids_tensor((batch_size, 5), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores) - self.assertListEqual(scores_before_min_length[:, eos_token_id].tolist(), 4 * [-float("inf")]) - - # check that min length is not applied anymore at length 15 - input_ids = ids_tensor((batch_size, 15), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = min_dist_processor(input_ids, scores) - self.assertFalse(torch.isinf(scores_before_min_length).any()) - - @parameterized.expand([(0,), ([0, 18],)]) - def test_new_min_length_dist_processor(self, eos_token_id: Union[int, List[int]]): - vocab_size = 20 - batch_size = 4 - - # check that first input is skipped (min new length applying) - input_ids = ids_tensor((batch_size, 5), 
vocab_size=20) - new_min_dist_processor = MinNewTokensLengthLogitsProcessor( - prompt_length_to_skip=input_ids.shape[-1], min_new_tokens=3, eos_token_id=eos_token_id - ) - - expected_eos_scores_before_min_length = batch_size * [-float("inf")] - if isinstance(eos_token_id, list): - expected_eos_scores_before_min_length *= len(eos_token_id) - - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertListEqual( - scores_before_min_length[:, eos_token_id].flatten().tolist(), expected_eos_scores_before_min_length - ) - - # check that, for skipping, now prompt length is 5, after that we expect first 5 tokens will be skipped - self.assertTrue(new_min_dist_processor.prompt_length_to_skip == 5) - - # check that min length is applied at length 2 - input_ids = ids_tensor((batch_size, 2), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertListEqual( - scores_before_min_length[:, eos_token_id].flatten().tolist(), expected_eos_scores_before_min_length - ) - - # check that min new length is applied at length 6 (because it has only 1 new token) - input_ids = ids_tensor((batch_size, 6), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertListEqual( - scores_before_min_length[:, eos_token_id].flatten().tolist(), expected_eos_scores_before_min_length - ) - - # check that min new length is applied at length 7 (because it has only 2 new tokens) - input_ids = ids_tensor((batch_size, 7), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertListEqual( - scores_before_min_length[:, eos_token_id].flatten().tolist(), expected_eos_scores_before_min_length - ) - - # check that min new length is not applied anymore at length 8 - input_ids = ids_tensor((batch_size, 8), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertFalse(torch.isinf(scores_before_min_length).any()) - - # check that min new length is not applied anymore at length 15 - input_ids = ids_tensor((batch_size, 15), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_min_length = new_min_dist_processor(input_ids, scores) - self.assertFalse(torch.isinf(scores_before_min_length).any()) - - def test_temperature_dist_warper(self): - input_ids = None - length = 20 - - scores = self._get_uniform_logits(batch_size=2, length=length) - - # tweak scores to not be uniform anymore - scores[1, 5] = (1 / length) + 0.1 # peak, 1st batch - scores[1, 10] = (1 / length) - 0.4 # valley, 1st batch - - # compute softmax - probs = nn.functional.softmax(scores, dim=-1) - - temp_dist_warper_sharper = TemperatureLogitsWarper(temperature=0.5) - temp_dist_warper_smoother = TemperatureLogitsWarper(temperature=1.3) - - warped_prob_sharp = nn.functional.softmax(temp_dist_warper_sharper(input_ids, scores), dim=-1) - warped_prob_smooth = nn.functional.softmax(temp_dist_warper_smoother(input_ids, scores), dim=-1) - processed_scores = temp_dist_warper_smoother(input_ids, scores) - - # uniform distribution stays uniform - self.assertTrue(torch.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3)) - self.assertTrue(torch.allclose(probs[0, :], 
warped_prob_smooth[0, :], atol=1e-3)) - - # sharp peaks get higher, valleys get lower - self.assertLess(probs[1, :].max(), warped_prob_sharp[1, :].max()) - self.assertGreater(probs[1, :].min(), warped_prob_sharp[1, :].min()) - - # smooth peaks get lower, valleys get higher - self.assertGreater(probs[1, :].max(), warped_prob_smooth[1, :].max()) - self.assertLess(probs[1, :].min(), warped_prob_smooth[1, :].min()) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - def test_repetition_penalty_dist_process(self): - input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long) - vocab_size = 10 - - scores = self._get_uniform_logits(batch_size=2, length=vocab_size) - - # give values special values - scores[0, 0] = -(1 / vocab_size) - scores[1, 5] = 4 / vocab_size - - rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=2.0) - - processed_scores = rep_penalty_proc(input_ids, scores) - - # check that values were correctly changed - self.assertAlmostEqual(processed_scores[0, 0].item(), -(1 / vocab_size) * 2) - self.assertAlmostEqual(processed_scores[0, 1].item(), (1 / vocab_size) / 2) - - self.assertAlmostEqual(processed_scores[1, 0].item(), (1 / vocab_size) / 2) - self.assertAlmostEqual(processed_scores[1, 5].item(), (4 / vocab_size) / 2) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - def test_encoder_repetition_penalty_dist_process(self): - input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long) - vocab_size = 10 - - scores = self._get_uniform_logits(batch_size=2, length=vocab_size) - - # give values special values - scores[0, 0] = -(1 / vocab_size) - scores[1, 5] = 4 / vocab_size - - rep_penalty_proc = EncoderRepetitionPenaltyLogitsProcessor(penalty=2.0, encoder_input_ids=input_ids) - - processed_scores = rep_penalty_proc(input_ids, scores.clone()) - - # check that values were correctly changed - self.assertAlmostEqual(processed_scores[0, 0].item(), -(1 / vocab_size) / 2) - self.assertAlmostEqual(processed_scores[0, 1].item(), (1 / vocab_size) * 2) - - self.assertAlmostEqual(processed_scores[1, 0].item(), (1 / vocab_size) * 2) - self.assertAlmostEqual(processed_scores[1, 5].item(), (4 / vocab_size) * 2) - - # check that values not in the encoder ids were NOT changed - self.assertAlmostEqual(processed_scores[0, 2].item(), (1 / vocab_size)) - self.assertAlmostEqual(processed_scores[1, 2].item(), (1 / vocab_size)) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - def test_top_k_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create ramp distribution - ramp_logits = ( - torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat(batch_size, 1) - ) - ramp_logits[1:, : vocab_size // 2] = ramp_logits[1:, : vocab_size // 2] + vocab_size - - top_k_warp = TopKLogitsWarper(3) - - scores = top_k_warp(input_ids, ramp_logits) - - # check that correct tokens are filtered - self.assertListEqual(torch.isinf(scores[0]).tolist(), 7 * [True] + 3 * [False]) - self.assertListEqual(torch.isinf(scores[1]).tolist(), 2 * [True] + 3 * [False] + 5 * [True]) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == ramp_logits)) - - # check special cases - length = 5 - - logits = self._get_uniform_logits(batch_size=batch_size, length=length) - top_k_warp_safety_check = TopKLogitsWarper(top_k=1, 
filter_value=0.0, min_tokens_to_keep=3) - - scores = top_k_warp_safety_check(input_ids, logits) - # uniform dist is not changed - self.assertListEqual((scores == 0.0).to(torch.long).sum(dim=-1).tolist(), [0, 0]) - - ramp_logits = torch.arange(length, device=torch_device, dtype=torch.float).unsqueeze(0).repeat(batch_size, 1) - scores = top_k_warp_safety_check(input_ids, ramp_logits) - - # min_tokens overwrites k: 3 tokens are kept => 2 tokens are nullified - self.assertListEqual((scores == 0.0).to(torch.long).sum(dim=-1).tolist(), [2, 2]) - - def test_top_p_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) - dist = torch.log( - torch.tensor([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]], device=torch_device, dtype=torch.float) - ) - - top_p_warp = TopPLogitsWarper(0.8) - filtered_dist = torch.exp(top_p_warp(input_ids, dist)) - - # dist should be filtered to keep min num values so that sum is >= top_p - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = torch.tensor( - [[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]], device=torch_device, dtype=torch.float - ) - self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) - - # processor should not change logits in-place - self.assertFalse(torch.all(top_p_warp(input_ids, dist) == dist)) - - # check edge cases with negative and extreme logits - ramp_logits = torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat( - batch_size, 1 - ) - (vocab_size // 2) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - top_p_warp = TopPLogitsWarper(0.9, min_tokens_to_keep=2, filter_value=0.0) - filtered_dist = top_p_warp(input_ids, ramp_logits) - - # first batch should keep three tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. 
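# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of the nucleus filtering tested above: TopPLogitsWarper
# keeps the smallest set of tokens whose cumulative probability reaches top_p
# and maps everything else to the filter value (-inf by default). The toy
# distribution is the one used in test_top_p_dist_warper.

import torch
from transformers.generation import TopPLogitsWarper

dist = torch.log(torch.tensor([[0.30, 0.10, 0.10, 0.50],
                               [0.15, 0.30, 0.30, 0.25]]))
warper = TopPLogitsWarper(top_p=0.8)
filtered = torch.exp(warper(None, dist))
# Row 0 keeps {0.5, 0.3}; row 1 needs three tokens to reach 0.8:
# approximately [[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]]
print(filtered)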
- self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [3, 2]) - - def test_typical_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) - dist = torch.log( - torch.tensor([[0.97, 0.01, 0.01, 0.01], [0.4, 0.2, 0.2, 0.2]], device=torch_device, dtype=torch.float) - ) - - typical_warp = TypicalLogitsWarper(0.5) - filtered_dist = torch.exp(typical_warp(input_ids, dist)) - - # dist should be filtered to keep min num values so that sum is >= 0.7 - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = torch.tensor( - [[0.97, 0.0, 0.0, 0.0], [0.0, 0.2, 0.2, 0.2]], device=torch_device, dtype=torch.float - ) - self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) - - # processor should not change logits in-place - self.assertFalse(torch.all(typical_warp(input_ids, dist) == dist)) - - # check special cases - length = 5 - - logits = self._get_uniform_logits(batch_size=batch_size, length=length) - typical_warp_safety_check = TypicalLogitsWarper(mass=0.5, filter_value=0.0, min_tokens_to_keep=3) - - scores = typical_warp_safety_check(input_ids, logits) - # uniform dist is not changed - self.assertListEqual((scores == 0.0).to(torch.long).sum(dim=-1).tolist(), [0, 0]) - - # check edge cases with negative and extreme logits - ramp_logits = torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat( - batch_size, 1 - ) - (vocab_size // 2) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - typical_warp = TypicalLogitsWarper(0.7, min_tokens_to_keep=2, filter_value=0.0) - filtered_dist = typical_warp(input_ids, ramp_logits) - - # first batch should keep two tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. - self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [2, 2]) - - def test_epsilon_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) - dist = torch.log( - torch.tensor( - [[0.87, 0.099, 0.001, 0.03], [0.4, 0.299, 0.101, 0.2]], device=torch_device, dtype=torch.float - ) - ) - - epsilon_warp = EpsilonLogitsWarper(0.1) - filtered_dist = torch.exp(epsilon_warp(input_ids, dist)) - - # dist should be filtered to only keep values with proba >= 0.1 - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = torch.tensor( - [[0.87, 0, 0, 0], [0.4, 0.299, 0.101, 0.2]], device=torch_device, dtype=torch.float - ) - self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) - - # processor should not change logits in-place - self.assertFalse(torch.all(epsilon_warp(input_ids, dist) == dist)) - - # check edge cases with negative and extreme logits - ramp_logits = torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat( - batch_size, 1 - ) - (vocab_size // 2) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - epsilon_warp = EpsilonLogitsWarper(5e-2, min_tokens_to_keep=2, filter_value=0.0) - filtered_dist = epsilon_warp(input_ids, ramp_logits) - - # first batch should keep 3 tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. 
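# --- Illustrative sketch (hypothetical, not part of the deleted files above) ---
# A minimal sketch of how the processors and warpers tested individually above
# are combined in practice: a LogitsProcessorList applies them in order to the
# same (input_ids, scores) pair. Shapes, token ids, and the eos id 0 are
# arbitrary choices for this illustration.

import torch
from transformers.generation import (
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
)

batch_size, vocab_size = 2, 10
input_ids = torch.randint(1, vocab_size, (batch_size, 3))
scores = torch.rand(batch_size, vocab_size)

pipeline = LogitsProcessorList(
    [
        MinLengthLogitsProcessor(min_length=5, eos_token_id=0),  # block EOS while the sequence is too short
        TemperatureLogitsWarper(temperature=0.7),                # sharpen the distribution
        TopKLogitsWarper(top_k=3),                               # keep only the 3 best tokens
    ]
)
processed = pipeline(input_ids, scores)
# After top-k filtering, exactly vocab_size - 3 entries per row are -inf.
assert torch.isinf(processed).sum(dim=-1).tolist() == [vocab_size - 3] * batch_size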
- self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [3, 2]) - - def test_eta_dist_warper(self): - input_ids = None - vocab_size = 10 - batch_size = 2 - - # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) - dist = torch.log( - torch.tensor([[0.0, 0.1, 0.8, 0.1], [0.01, 0.04, 0.9, 0.05]], device=torch_device, dtype=torch.float) - ) - - eta_warp = EtaLogitsWarper(0.0625) - filtered_dist = torch.exp(eta_warp(input_ids, dist)) - - # dist should be filtered to only keep values with proba >= min(0.0625, sqrt(0.0625) * e^-H(p)) - # min(0.0625, 0.1320) is the cutoff for the first row and min(0.0625, 0.1644) is for the second - # where H is the entropy function and p is the probability vector. - # exp (-inf) => 0 - EXPECTED_FILTERED_DIST = torch.tensor( - [[0.0, 0.1, 0.8, 0.1], [0.0, 0.0, 0.9, 0.0]], device=torch_device, dtype=torch.float - ) - self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) - - # processor should not change logits in-place - self.assertFalse(torch.all(eta_warp(input_ids, dist) == dist)) - - # check edge cases with negative and extreme logits - ramp_logits = torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat( - batch_size, 1 - ) - (vocab_size // 2) - - # make ramp_logits more extreme - ramp_logits[1] = ramp_logits[1] * 100.0 - - # make sure at least 2 tokens are kept - eta_warp = EtaLogitsWarper(0.1, min_tokens_to_keep=2, filter_value=0.0) - filtered_dist = eta_warp(input_ids, ramp_logits) - - # first batch should keep 2 tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. - self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [2, 2]) - - def test_no_repeat_ngram_dist_processor(self): - vocab_size = 3 - batch_size = 2 - - input_ids = torch.tensor([[1, 1, 2, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) - scores = self._get_uniform_logits(batch_size, vocab_size) - - no_repeat_proc_2_gram = NoRepeatNGramLogitsProcessor(2) - no_repeat_proc_3_gram = NoRepeatNGramLogitsProcessor(3) - - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) - filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores.clone()) - - # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual(torch.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [True, False, False]]) - - # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch - self.assertListEqual( - torch.isinf(filtered_scores_3_gram).tolist(), [[False, False, False], [True, False, False]] - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == filtered_scores_2_gram)) - self.assertFalse(torch.all(scores == filtered_scores_3_gram)) - - def test_encoder_no_repeat_ngram_dist_processor(self): - vocab_size = 3 - num_beams = 2 - batch_size = 1 - - encoder_input_ids = torch.tensor([1, 2, 1, 1], device=torch_device, dtype=torch.long) - - input_ids = torch.tensor([[1, 2, 1], [8, 0, 2]], device=torch_device, dtype=torch.long) - scores = self._get_uniform_logits(batch_size * num_beams, vocab_size) - - no_repeat_proc_2_gram = EncoderNoRepeatNGramLogitsProcessor(2, encoder_input_ids=encoder_input_ids) - no_repeat_proc_3_gram = EncoderNoRepeatNGramLogitsProcessor(3, encoder_input_ids=encoder_input_ids) - - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) - filtered_scores_3_gram = 
no_repeat_proc_3_gram(input_ids, scores.clone()) - - # 2-gram would forbid 1st and 2nd token at 1st beam and 1st token (0) at 2nd beam - self.assertListEqual(torch.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [False, True, False]]) - - # 3-gram would forbid 1st token at 1st beam and no token at 2nd beam - self.assertListEqual( - torch.isinf(filtered_scores_3_gram).tolist(), [[False, True, False], [False, False, False]] - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == filtered_scores_2_gram)) - self.assertFalse(torch.all(scores == filtered_scores_3_gram)) - - # Batched input - vocab_size = 3 - num_beams = 2 - batch_size = 2 - encoder_input_ids = torch.tensor([[1, 2, 1, 1], [0, 0, 2, 1]], device=torch_device, dtype=torch.long) - - input_ids = torch.tensor([[1, 2, 1], [1, 0, 2], [0, 0, 0], [0, 2, 2]], device=torch_device, dtype=torch.long) - scores = self._get_uniform_logits(batch_size * num_beams, vocab_size) - - no_repeat_proc_2_gram = EncoderNoRepeatNGramLogitsProcessor(2, encoder_input_ids=encoder_input_ids) - no_repeat_proc_3_gram = EncoderNoRepeatNGramLogitsProcessor(3, encoder_input_ids=encoder_input_ids) - - filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) - filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores.clone()) - - # 2gram - # Batch 1 - # - Beam 1: tokens (1, 2) forbidden - # - Beam 2: tokens (1) forbidden - # Batch 2 - # - Beam 1: tokens (0, 2) forbidden - # - Beam 2: tokens (1) forbidden - self.assertListEqual( - torch.isinf(filtered_scores_2_gram).tolist(), - [[False, True, True], [False, True, False], [True, False, True], [False, True, False]], - ) - - # Batch 1 - # - Beam 1: tokens (1) forbidden - # - Beam 2: tokens () forbidden - # Batch 2 - # - Beam 1: tokens (2) forbidden - # - Beam 2: tokens () forbidden - self.assertListEqual( - torch.isinf(filtered_scores_3_gram).tolist(), - [[False, True, False], [False, False, False], [False, False, True], [False, False, False]], - ) - - def test_no_bad_words_dist_processor(self): - vocab_size = 5 - batch_size = 2 - eos_token_id = 4 - - input_ids = torch.tensor([[0, 1, 3, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) - bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] - scores = self._get_uniform_logits(batch_size, vocab_size) - - no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) - - filtered_scores = no_bad_words_dist_proc(input_ids, scores.clone()) - - # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden - # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden - # Note that 5th element cannot be forbidden as it is EOS token - self.assertListEqual( - torch.isinf(filtered_scores).tolist(), [[True, True, False, True, False], [True, True, True, False, False]] - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == filtered_scores)) - - # check edge case - no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=[[4]], eos_token_id=eos_token_id) - filtered_scores = no_bad_words_dist_proc(input_ids, scores.clone()) - self.assertTrue(torch.allclose(scores, filtered_scores, atol=1e-3)) - - def test_bias_dist_processor(self): - vocab_size = 5 - batch_size = 2 - - input_ids = torch.tensor([[0, 1, 3, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) - positive_bias = {(1,): 100.0, (4,): 100.0} - negative_bias = {(1, 0): -100.0, (0, 1, 2): -100.0, (1, 3, 1, 3): -100.0} - # biases the same termination 
twice, to ensure we can handle overlapping terminations (it won't have an effect - # on the test cases, though) - negative_bias.update({(1, 3, 1, 3, 1, 3): -100.0}) - sequence_bias = {**positive_bias, **negative_bias} - - # scores = 0 to facilitate checks - scores = torch.zeros((batch_size, vocab_size), dtype=torch.float, device=torch_device) - - bias_dist_proc = SequenceBiasLogitsProcessor(sequence_bias=sequence_bias) - filtered_scores = bias_dist_proc(input_ids, scores.clone()) - - # batch 1: positive bias: tokens (1, 4); negative bias: tokens (0, 3); neutral: tokens (2) - # batch 2: positive bias: tokens (1, 4); negative bias: tokens (0, 2); neutral: tokens (3) - self.assertListEqual( - filtered_scores.tolist(), [[-100.0, 100.0, 0.0, -100.0, 100.0], [-100.0, 100.0, -100.0, 0.0, 100.0]] - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == filtered_scores)) - - def test_processor_list(self): - batch_size = 4 - sequence_length = 10 - vocab_size = 15 - eos_token_id = 0 - - # dummy input_ids and scores - input_ids = ids_tensor((batch_size, sequence_length), vocab_size) - input_ids_comp = input_ids.clone() - - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_comp = scores.clone() - - # instantiate all dist processors - min_dist_proc = MinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) - temp_dist_warp = TemperatureLogitsWarper(temperature=0.5) - rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=2.0) - top_k_warp = TopKLogitsWarper(3) - top_p_warp = TopPLogitsWarper(0.8) - no_repeat_proc = NoRepeatNGramLogitsProcessor(2) - no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) - - # no processor list - scores = min_dist_proc(input_ids, scores) - scores = temp_dist_warp(input_ids, scores) - scores = rep_penalty_proc(input_ids, scores) - scores = top_k_warp(input_ids, scores) - scores = top_p_warp(input_ids, scores) - scores = no_repeat_proc(input_ids, scores) - scores = no_bad_words_dist_proc(input_ids, scores) - - # with processor list - processor = LogitsProcessorList( - [ - min_dist_proc, - temp_dist_warp, - rep_penalty_proc, - top_k_warp, - top_p_warp, - no_repeat_proc, - no_bad_words_dist_proc, - ] - ) - scores_comp = processor(input_ids, scores_comp) - - # scores should be equal - self.assertTrue(torch.allclose(scores, scores_comp, atol=1e-3)) - - # input_ids should never be changed - self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist()) - - def test_prefix_constrained_logits_processor(self): - vocab_size = 5 - batch_size = 2 - - input_ids = torch.tensor([[0, 1, 3, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) - scores = self._get_uniform_logits(batch_size, vocab_size) - - def prefix_allowed_tokens_fn(batch_id, inputs_ids): - return [[0, 1], [2, 3]][batch_id] - - prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(prefix_allowed_tokens_fn, 1) - - filtered_scores = prefix_constrained_logits_proc(input_ids, scores.clone()) - - # batch 1: 1st, 2nd (0, 1) token are allowed - # batch 2: 3rd, 4th (2, 3) token are allowed - self.assertListEqual( - torch.isinf(filtered_scores).tolist(), [[False, False, True, True, True], [True, True, False, False, True]] - ) - - def empty_prefix_allowed_tokens_fn(batch_id, inputs_ids): - return [] - - prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(empty_prefix_allowed_tokens_fn, 1) - - self.assertRaises(ValueError, prefix_constrained_logits_proc, input_ids, scores) - - # processor 
should not change logits in-place - self.assertFalse(torch.all(scores == filtered_scores)) - - def test_hamming_diversity(self): - vocab_size = 4 - num_beams = 2 - num_beam_groups = 2 - - scores = self._get_uniform_logits(num_beams, vocab_size) - # batch_idx = 0 -> index batch_idx * num_beam_groups -> idx = 0 * 2 = 0 -> penalises tokens 1 - # batch_idx = 1 -> index batch_idx * num_beam_groups -> idx = 1 * 2 = 2 -> penalises tokens 1 - current_tokens = torch.tensor([0, 3, 1, 2], device=torch_device, dtype=torch.long) - - diversity_logits_processor = HammingDiversityLogitsProcessor( - diversity_penalty=1.0, num_beams=num_beams, num_beam_groups=num_beam_groups - ) - - processed_scores = diversity_logits_processor(None, scores, current_tokens, 1) - - self.assertTrue( - torch.allclose( - processed_scores[0], torch.tensor([-0.7500, 0.2500, 0.2500, 0.2500], device=torch_device), atol=1e-3 - ) - ) - self.assertTrue( - torch.allclose( - processed_scores[1], torch.tensor([0.2500, -0.7500, 0.2500, 0.2500], device=torch_device), atol=1e-3 - ) - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - def test_forced_bos_token_logits_processor(self): - vocab_size = 20 - batch_size = 4 - bos_token_id = 0 - - logits_processor = ForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) - - # check that all scores are -inf except the bos_token_id score - input_ids = ids_tensor((batch_size, 1), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - processed_scores = logits_processor(input_ids, scores) - self.assertTrue(torch.isneginf(processed_scores[:, bos_token_id + 1 :]).all()) - # score for bos_token_id should be zero - self.assertListEqual(processed_scores[:, bos_token_id].tolist(), 4 * [0]) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - # check that bos_token_id is not forced if current length is greater than 1 - input_ids = ids_tensor((batch_size, 4), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - processed_scores = logits_processor(input_ids, scores) - self.assertFalse(torch.isinf(processed_scores).any()) - - def test_forced_eos_token_logits_processor(self): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - max_length = 5 - - logits_processor = ForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) - - # check that all scores are -inf except the eos_token_id when max_length-1 is reached - input_ids = ids_tensor((batch_size, 4), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - processed_scores = logits_processor(input_ids, scores) - self.assertTrue(torch.isneginf(processed_scores[:, eos_token_id + 1 :]).all()) - # score for eos_token_id should be zero - self.assertListEqual(processed_scores[:, eos_token_id].tolist(), 4 * [0]) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - # check that eos_token_id is not forced if max_length-1 is not reached - input_ids = ids_tensor((batch_size, 3), vocab_size=20) - scores = self._get_uniform_logits(batch_size, vocab_size) - processed_scores = logits_processor(input_ids, scores) - self.assertFalse(torch.isinf(processed_scores).any()) - - def test_remove_nan_inf_logits_processor(self): - scores = torch.tensor( - [[0.0, 0.7, 0.8, float("nan")], [0.1, float("inf"), 0.3, float("-inf")]], device=torch_device - ) - input_ids = ids_tensor((2, 4), vocab_size=20) - - logits_processor 
= InfNanRemoveLogitsProcessor() - - processed_scores = logits_processor(input_ids, scores) - - self.assertTrue( - torch.allclose( - processed_scores, - torch.tensor( - [ - [0.0, 0.7, 0.8, 0.0], - [0.1, torch.finfo(processed_scores.dtype).max, 0.3, torch.finfo(processed_scores.dtype).min], - ], - device=torch_device, - ), - atol=1e-6, - ) - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - - def test_exponential_decay_length_penalty(self): - vocab_size = 20 - batch_size = 4 - eos_token_id = 0 - - penalty_start = 5 - penalty_factor = 1.1 - - input_ids = ids_tensor((batch_size, 2), vocab_size=vocab_size) - input_ids_seq_length = input_ids.shape[-1] - - length_decay_processor = ExponentialDecayLengthPenalty( - exponential_decay_length_penalty=(penalty_start, penalty_factor), - eos_token_id=eos_token_id, - input_ids_seq_length=input_ids_seq_length, - ) - - # check that penalty is not applied before start - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_before_start = length_decay_processor(input_ids, scores) - self.assertListEqual(scores_before_start[:, eos_token_id].tolist(), scores[:, eos_token_id].tolist()) - - # check that penalty is applied after start - input_ids = ids_tensor((batch_size, 20), vocab_size=vocab_size) - scores = self._get_uniform_logits(batch_size, vocab_size) - scores_after_start = length_decay_processor(input_ids, scores) - self.assertTrue(torch.gt(scores_after_start[:, eos_token_id], scores[:, eos_token_id]).all()) - - # check the penalty increases negative scores - input_ids = ids_tensor((batch_size, 20), vocab_size=vocab_size) - scores = torch.neg(self._get_uniform_logits(batch_size, vocab_size)) - scores_after_start = length_decay_processor(input_ids, scores) - self.assertTrue(torch.gt(scores_after_start[:, eos_token_id], scores[:, eos_token_id]).all()) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == scores_after_start)) - - def test_normalization(self): - input_ids = None - - scores = torch.tensor( - [[-23.18, -29.96, -43.54, 47.77], [-33.58, -26.87, -32.96, 22.51]], device=torch_device, dtype=torch.float - ) - - logit_normalization = LogitNormalization() - normalized_scores = logit_normalization(input_ids, scores).exp() - - ones = torch.ones(scores.shape[0], device=torch_device, dtype=torch.float) - self.assertTrue(normalized_scores.sum(dim=-1).allclose(ones)) - - self.assertTrue(normalized_scores.allclose(scores.softmax(dim=-1))) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == normalized_scores)) - - def test_classifier_free_guidance(self): - class Namespace(dict): - pass - - logits_uncond = torch.tensor([[[1.0, 0, 1.5]]]) - logits_cond = torch.tensor([[[1.0, 1.0, 1.0]]]) - - def dummy_model(input_ids, attention_mask, use_cache=True, past_key_values=None): - out = Namespace() - out.logits = logits_uncond - out.past_key_values = None - return out - - def lsm(x): - return torch.nn.functional.log_softmax(x, dim=-1) - - # explicit unconditional prompt + attention mask - input_ids = torch.LongTensor([[0]]) - cfg = UnbatchedClassifierFreeGuidanceLogitsProcessor( - 1.5, dummy_model, input_ids, torch.ones_like(input_ids, dtype=torch.long) - ) - out = cfg(input_ids, logits_cond)[0, -1] - - res = (lsm(logits_uncond) + 1.5 * (lsm(logits_cond) - lsm(logits_uncond)))[0, -1] - - self.assertAlmostEqual(out[0].item(), res[0].item()) - self.assertAlmostEqual(out[1].item(), res[1].item()) - 
self.assertAlmostEqual(out[2].item(), res[2].item()) - - # explicit unconditional prompt - input_ids = torch.LongTensor([[0]]) - cfg = UnbatchedClassifierFreeGuidanceLogitsProcessor(1.5, dummy_model, input_ids) - out = cfg(input_ids, logits_cond)[0, -1] - - res = (lsm(logits_uncond) + 1.5 * (lsm(logits_cond) - lsm(logits_uncond)))[0, -1] - - self.assertAlmostEqual(out[0].item(), res[0].item()) - self.assertAlmostEqual(out[1].item(), res[1].item()) - self.assertAlmostEqual(out[2].item(), res[2].item()) - - # all implicit - input_ids = torch.LongTensor([[0]]) - cfg = UnbatchedClassifierFreeGuidanceLogitsProcessor(1.5, dummy_model) - out = cfg(input_ids, logits_cond)[0, -1] - - res = (lsm(logits_uncond) + 1.5 * (lsm(logits_cond) - lsm(logits_uncond)))[0, -1] - - self.assertAlmostEqual(out[0].item(), res[0].item()) - self.assertAlmostEqual(out[1].item(), res[1].item()) - self.assertAlmostEqual(out[2].item(), res[2].item()) - - def test_early_stop_processor(self): - input_ids = None - eos_token_id = 2 - min_eos_p = 0.1 ## some small float - - scores = self._get_uniform_logits(2, 4) - scores[0][eos_token_id] = -6 ## less than log(min_eos_p) - - esp = BarkEosPrioritizerLogitsProcessor(eos_token_id=eos_token_id, min_eos_p=min_eos_p) - actual_scores = esp(input_ids, scores) - expected_scores_list = [ - scores[0].tolist(), - [float("-inf"), float("-inf"), scores[0][0], float("-inf")], - ] - self.assertListEqual(actual_scores.tolist(), expected_scores_list) - - def test_early_stop_processor_multi_eos(self): - input_ids = None - eos_token_id = [2, 3] - min_eos_p = 0.1 ## some small float - - scores = self._get_uniform_logits(2, 4) - scores[0][eos_token_id] = -6 ## less than log(min_eos_p) - - esp = BarkEosPrioritizerLogitsProcessor(eos_token_id=eos_token_id, min_eos_p=min_eos_p) - actual_scores = esp(input_ids, scores) - expected_scores_list = [ - scores[0].tolist(), - [float("-inf"), float("-inf"), scores[0][0], scores[0][0]], - ] - self.assertListEqual(actual_scores.tolist(), expected_scores_list) diff --git a/tests/transformers/tests/generation/test_stopping_criteria.py b/tests/transformers/tests/generation/test_stopping_criteria.py deleted file mode 100644 index 0ce7838eee..0000000000 --- a/tests/transformers/tests/generation/test_stopping_criteria.py +++ /dev/null @@ -1,130 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
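For context, a minimal sketch of the usage pattern the stopping-criteria tests below exercise, mirroring the calls they make (the tensor shapes and thresholds here are illustrative only):

import torch
from transformers.generation import MaxLengthCriteria, MaxTimeCriteria, StoppingCriteriaList

# Dummy inputs shaped like the test helpers: (batch_size, current_length).
input_ids = torch.zeros((3, 5), dtype=torch.long)
scores = torch.ones((3, 5)) / 5

# A StoppingCriteriaList aggregates individual criteria; the tests treat the result of a call
# as an iterable of per-sequence stop flags.
criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=10), MaxTimeCriteria(max_time=0.1)])
should_stop = all(criteria(input_ids, scores))  # False: length 5 < 10 and the time budget has not elapsed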
- -import time -import unittest - -from transformers import is_torch_available -from transformers.testing_utils import require_torch - -from ..test_modeling_common import ids_tensor, torch_device - - -if is_torch_available(): - import torch - from transformers.generation import ( - EosTokenCriteria, - MaxLengthCriteria, - MaxNewTokensCriteria, - MaxTimeCriteria, - StoppingCriteriaList, - validate_stopping_criteria, - ) - - -@require_torch -class StoppingCriteriaTestCase(unittest.TestCase): - def _get_tensors(self, length): - batch_size = 3 - vocab_size = 250 - - input_ids = ids_tensor((batch_size, length), vocab_size) - scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length - return input_ids, scores - - def test_list_criteria(self): - input_ids, scores = self._get_tensors(5) - - criteria = StoppingCriteriaList( - [ - MaxLengthCriteria(max_length=10), - MaxTimeCriteria(max_time=0.1), - ] - ) - - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(9) - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(10) - self.assertTrue(all(criteria(input_ids, scores))) - - def test_max_length_criteria(self): - criteria = MaxLengthCriteria(max_length=10) - - input_ids, scores = self._get_tensors(5) - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(9) - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(10) - self.assertTrue(all(criteria(input_ids, scores))) - - def test_max_new_tokens_criteria(self): - criteria = MaxNewTokensCriteria(start_length=5, max_new_tokens=5) - - input_ids, scores = self._get_tensors(5) - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(9) - self.assertFalse(all(criteria(input_ids, scores))) - - input_ids, scores = self._get_tensors(10) - self.assertTrue(all(criteria(input_ids, scores))) - - criteria_list = StoppingCriteriaList([criteria]) - self.assertEqual(criteria_list.max_length, 10) - - def test_max_time_criteria(self): - input_ids, scores = self._get_tensors(5) - - criteria = MaxTimeCriteria(max_time=0.1) - self.assertFalse(all(criteria(input_ids, scores, needs_tensor_output=True))) - self.assertFalse(criteria(input_ids, scores, needs_tensor_output=False)) - - criteria = MaxTimeCriteria(max_time=0.1, initial_timestamp=time.time() - 0.2) - self.assertTrue(all(criteria(input_ids, scores, needs_tensor_output=True))) - self.assertTrue(criteria(input_ids, scores, needs_tensor_output=False)) - - def test_eos_token_criteria(self): - criteria = EosTokenCriteria(eos_token_id=0) - - input_ids, scores = self._get_tensors(5) - input_ids[:, -1] = 0 - self.assertTrue(all(criteria(input_ids, scores, needs_tensor_output=True))) - self.assertTrue(criteria(input_ids, scores, needs_tensor_output=False)) - - input_ids, scores = self._get_tensors(5) - input_ids[:2, -1] = 0 - input_ids[2, -1] = 1 - self.assertListEqual(criteria(input_ids, scores, needs_tensor_output=True).tolist(), [True, True, False]) - self.assertFalse(criteria(input_ids, scores, needs_tensor_output=False)) - - input_ids, scores = self._get_tensors(5) - input_ids[:, -1] = 1 - self.assertListEqual(criteria(input_ids, scores, needs_tensor_output=True).tolist(), [False, False, False]) - self.assertFalse(criteria(input_ids, scores, needs_tensor_output=False)) - - def test_validate_stopping_criteria(self): - validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 10) 
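# validate_stopping_criteria returns the (possibly extended) criteria list: a max_length that
# disagrees with an existing MaxLengthCriteria only triggers a UserWarning (checked next), while
# an empty list gets a MaxLengthCriteria appended, which is why its length becomes 1 below.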
- - with self.assertWarns(UserWarning): - validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 11) - - stopping_criteria = validate_stopping_criteria(StoppingCriteriaList(), 11) - - self.assertEqual(len(stopping_criteria), 1) diff --git a/tests/transformers/tests/generation/test_streamers.py b/tests/transformers/tests/generation/test_streamers.py deleted file mode 100644 index 9efb572431..0000000000 --- a/tests/transformers/tests/generation/test_streamers.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from queue import Empty -from threading import Thread - -from transformers import AutoTokenizer, TextIteratorStreamer, TextStreamer, is_torch_available -from transformers.testing_utils import CaptureStdout, require_torch, torch_device - -from ..test_modeling_common import ids_tensor - - -if is_torch_available(): - import torch - from transformers import AutoModelForCausalLM - - -@require_torch -class StreamerTester(unittest.TestCase): - def test_text_streamer_matches_non_streaming(self): - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - model.config.eos_token_id = -1 - - input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) - greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False) - greedy_text = tokenizer.decode(greedy_ids[0]) - - with CaptureStdout() as cs: - streamer = TextStreamer(tokenizer) - model.generate(input_ids, max_new_tokens=10, do_sample=False, streamer=streamer) - # The greedy text should be printed to stdout, except for the final "\n" in the streamer - streamer_text = cs.out[:-1] - - self.assertEqual(streamer_text, greedy_text) - - def test_iterator_streamer_matches_non_streaming(self): - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - model.config.eos_token_id = -1 - - input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) - greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False) - greedy_text = tokenizer.decode(greedy_ids[0]) - - streamer = TextIteratorStreamer(tokenizer) - generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer} - thread = Thread(target=model.generate, kwargs=generation_kwargs) - thread.start() - streamer_text = "" - for new_text in streamer: - streamer_text += new_text - - self.assertEqual(streamer_text, greedy_text) - - def test_text_streamer_skip_prompt(self): - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - model.config.eos_token_id = -1 - - input_ids = 
ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) - greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False) - new_greedy_ids = greedy_ids[:, input_ids.shape[1] :] - new_greedy_text = tokenizer.decode(new_greedy_ids[0]) - - with CaptureStdout() as cs: - streamer = TextStreamer(tokenizer, skip_prompt=True) - model.generate(input_ids, max_new_tokens=10, do_sample=False, streamer=streamer) - # The greedy text should be printed to stdout, except for the final "\n" in the streamer - streamer_text = cs.out[:-1] - - self.assertEqual(streamer_text, new_greedy_text) - - def test_text_streamer_decode_kwargs(self): - # Tests that we can pass `decode_kwargs` to the streamer to control how the tokens are decoded. Must be tested - # with actual models -- the dummy models' tokenizers are not aligned with their models, and - # `skip_special_tokens=True` has no effect on them - tokenizer = AutoTokenizer.from_pretrained("distilgpt2") - model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(torch_device) - model.config.eos_token_id = -1 - - input_ids = torch.ones((1, 5), device=torch_device).long() * model.config.bos_token_id - with CaptureStdout() as cs: - streamer = TextStreamer(tokenizer, skip_special_tokens=True) - model.generate(input_ids, max_new_tokens=1, do_sample=False, streamer=streamer) - - # The prompt contains a special token, so the streamer should not print it. As such, the output text, when - # re-tokenized, must only contain one token - streamer_text = cs.out[:-1] # Remove the final "\n" - streamer_text_tokenized = tokenizer(streamer_text, return_tensors="pt") - self.assertEqual(streamer_text_tokenized.input_ids.shape, (1, 1)) - - def test_iterator_streamer_timeout(self): - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - model.config.eos_token_id = -1 - - input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) - streamer = TextIteratorStreamer(tokenizer, timeout=0.001) - generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer} - thread = Thread(target=model.generate, kwargs=generation_kwargs) - thread.start() - - # The streamer will timeout after 0.001 seconds, so an exception will be raised - with self.assertRaises(Empty): - streamer_text = "" - for new_text in streamer: - streamer_text += new_text diff --git a/tests/transformers/tests/generation/test_utils.py b/tests/transformers/tests/generation/test_utils.py deleted file mode 100644 index 8ffbad89d1..0000000000 --- a/tests/transformers/tests/generation/test_utils.py +++ /dev/null @@ -1,3286 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
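The streamer tests above all follow one pattern: run generate() on a worker thread and drain the decoded text from a TextIteratorStreamer on the consumer side. A condensed sketch of that pattern, using the same tiny test checkpoint (prompt and generation settings are illustrative only):

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

inputs = tokenizer("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
generation_kwargs = {**inputs, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}

# generate() blocks, so it runs on its own thread while this thread consumes the streamer.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
streamed_text = "".join(new_text for new_text in streamer)  # iteration ends when generation finishes
thread.join()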
- - -import inspect -import unittest -import warnings - -import numpy as np -import pytest -from transformers import is_torch_available, pipeline -from transformers.testing_utils import require_torch, slow - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ..test_modeling_common import floats_tensor, ids_tensor -from .test_framework_agnostic import GenerationIntegrationTestsMixin - - -if is_torch_available(): - import torch - from transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, - AutoTokenizer, - BartForConditionalGeneration, - BartTokenizer, - GPT2LMHeadModel, - GPT2Tokenizer, - ImageGPTForCausalImageModeling, - PreTrainedModel, - SpeechEncoderDecoderModel, - ) - from transformers.generation import ( - BeamSampleDecoderOnlyOutput, - BeamSampleEncoderDecoderOutput, - BeamSearchDecoderOnlyOutput, - BeamSearchEncoderDecoderOutput, - BeamSearchScorer, - ConstrainedBeamSearchScorer, - DisjunctiveConstraint, - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - GenerateEncoderDecoderOutput, - GreedySearchDecoderOnlyOutput, - GreedySearchEncoderDecoderOutput, - HammingDiversityLogitsProcessor, - InfNanRemoveLogitsProcessor, - LogitsProcessorList, - MaxLengthCriteria, - MinLengthLogitsProcessor, - NoBadWordsLogitsProcessor, - NoRepeatNGramLogitsProcessor, - PhrasalConstraint, - RepetitionPenaltyLogitsProcessor, - SampleDecoderOnlyOutput, - SampleEncoderDecoderOutput, - StoppingCriteria, - StoppingCriteriaList, - TemperatureLogitsWarper, - TopKLogitsWarper, - TopPLogitsWarper, - ) - from transformers.generation.candidate_generator import AssistedCandidateGenerator, CandidateGenerator - from transformers.generation.streamers import BaseStreamer - -torch_device = "hpu" -adapt_transformers_to_gaudi() - - -class GenerationTesterMixin: - model_tester = None - all_generative_model_classes = () - input_name = "input_ids" - - def _update_default_model_kwargs(self, model_kwargs): - model_kwargs["limit_hpu_graphs"] = False - model_kwargs["reuse_cache"] = False - model_kwargs["bucket_size"] = -1 - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - - # generate max 3 tokens - max_length = input_ids.shape[-1] + 3 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - # TransfoXL has no attention mask - if "transfoxl" in config.__class__.__name__.lower(): - attention_mask = None - else: - attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:batch_size, :sequence_length] - - return config, input_ids, attention_mask, max_length - - @staticmethod - def _get_logits_processor_and_kwargs( - input_length, - eos_token_id, - forced_bos_token_id=None, - forced_eos_token_id=None, - max_length=None, - diversity_penalty=None, - ): - process_kwargs = { - "min_length": input_length + 1 if max_length is None else max_length - 1, - "bad_words_ids": [[1, 0]], - "no_repeat_ngram_size": 2, - "repetition_penalty": 1.2, - } - logits_processor = LogitsProcessorList( - ( - [ - 
HammingDiversityLogitsProcessor(diversity_penalty, num_beams=2, num_beam_groups=2), - ] - if diversity_penalty is not None - else [] - ) - + ( - [ - MinLengthLogitsProcessor(process_kwargs["min_length"], eos_token_id), - ] - if eos_token_id is not None - else [] - ) - + ( - [ - ForcedBOSTokenLogitsProcessor(forced_bos_token_id), - ] - if forced_bos_token_id is not None - else [] - ) - + ( - [ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)] - if forced_eos_token_id is not None - else [] - ) - + [ - NoBadWordsLogitsProcessor(process_kwargs["bad_words_ids"], eos_token_id), - NoRepeatNGramLogitsProcessor(process_kwargs["no_repeat_ngram_size"]), - RepetitionPenaltyLogitsProcessor(process_kwargs["repetition_penalty"]), - ] - ) - return process_kwargs, logits_processor - - @staticmethod - def _get_warper_and_kwargs(num_beams): - warp_kwargs = {"top_k": 10, "top_p": 0.7, "temperature": 0.7} - logits_warper = LogitsProcessorList( - [ - TemperatureLogitsWarper(warp_kwargs["temperature"]), - TopKLogitsWarper(top_k=warp_kwargs["top_k"], min_tokens_to_keep=(2 if num_beams > 1 else 1)), - TopPLogitsWarper(top_p=warp_kwargs["top_p"], min_tokens_to_keep=(2 if num_beams > 1 else 1)), - ] - ) - return warp_kwargs, logits_warper - - @staticmethod - def _get_beam_scorer_and_kwargs(batch_size, max_length, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - } - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=beam_kwargs["num_beams"], - device=torch_device, - length_penalty=beam_kwargs["length_penalty"], - do_early_stopping=beam_kwargs["early_stopping"], - num_beam_hyps_to_keep=num_return_sequences, - ) - return beam_kwargs, beam_scorer - - @staticmethod - def _get_diverse_beam_scorer_and_kwargs(batch_size, max_length, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - "num_beam_groups": 2, # one beam per group - "diversity_penalty": 2.0, - } - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=beam_kwargs["num_beams"], - device=torch_device, - length_penalty=beam_kwargs["length_penalty"], - do_early_stopping=beam_kwargs["early_stopping"], - num_beam_hyps_to_keep=num_return_sequences, - num_beam_groups=beam_kwargs["num_beam_groups"], - ) - return beam_kwargs, beam_scorer - - @staticmethod - def _get_constrained_beam_scorer_and_kwargs(batch_size, max_length, constraints, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": num_return_sequences * 4, - "num_return_sequences": num_return_sequences, - } - beam_scorer = ConstrainedBeamSearchScorer( - batch_size=batch_size, - constraints=constraints, - num_beams=beam_kwargs["num_beams"], - device=torch_device, - length_penalty=beam_kwargs["length_penalty"], - do_early_stopping=beam_kwargs["early_stopping"], - num_beam_hyps_to_keep=num_return_sequences, - ) - return beam_kwargs, beam_scorer - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = model.get_encoder() - encoder_outputs = encoder( - input_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( - 
num_interleave, dim=0 - ) - input_ids = torch.zeros_like(input_ids[:, :1]) + model._get_decoder_start_token_id() - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - @staticmethod - def _get_static_shapes(): - return False - - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - max_length, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - if model.config.is_encoder_decoder: - max_length = 4 - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - eos_token_id=model.config.eos_token_id, - forced_bos_token_id=model.config.forced_bos_token_id, - forced_eos_token_id=model.config.forced_eos_token_id, - max_length=max_length, - ) - - kwargs = {} - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - model.generation_config.static_shapes = self._get_static_shapes() - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_length=max_length, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **logits_process_kwargs, - **model_kwargs, - ) - - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_greedy = model.greedy_search( - input_ids, - max_length=max_length, - logits_processor=logits_processor, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - return output_greedy, output_generate - - def _sample_generate( - self, - model, - input_ids, - attention_mask, - max_length, - num_return_sequences, - logits_processor, - logits_warper, - logits_warper_kwargs, - process_kwargs, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - torch.manual_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - model.generation_config.static_shapes = self._get_static_shapes() - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_length=max_length, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **logits_warper_kwargs, - **process_kwargs, - **model_kwargs, - ) - - torch.manual_seed(0) - kwargs = {} - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - num_interleave=num_return_sequences, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - elif attention_mask is not None: - attention_mask = 
attention_mask.repeat_interleave(num_return_sequences, dim=0) - - # prevent flaky generation test failures - logits_processor.append(InfNanRemoveLogitsProcessor()) - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_sample = model.sample( - input_ids.repeat_interleave(num_return_sequences, dim=0), - max_length=max_length, - logits_processor=logits_processor, - logits_warper=logits_warper, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - - return output_sample, output_generate - - def _beam_search_generate( - self, - model, - input_ids, - attention_mask, - max_length, - beam_scorer, - beam_kwargs, - logits_processor, - logits_process_kwargs, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - model.generation_config.static_shapes = self._get_static_shapes() - output_generate = model.generate( - input_ids, - do_sample=False, - max_length=max_length, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **beam_kwargs, - **logits_process_kwargs, - **model_kwargs, - ) - - # beam_search does not automatically interleave `batch_size` dim for `num_beams` - kwargs = {} - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - num_interleave=beam_scorer.num_beams, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - elif attention_mask is not None: - attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_beam_search = model.beam_search( - input_ids.repeat_interleave(beam_scorer.num_beams, dim=0), - beam_scorer, - max_length=max_length, - logits_processor=logits_processor, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - return output_generate, output_beam_search - - def _beam_sample_generate( - self, - model, - input_ids, - attention_mask, - max_length, - beam_scorer, - beam_kwargs, - logits_warper, - logits_warper_kwargs, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - torch.manual_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_generate = model.generate( - input_ids, - do_sample=True, - max_length=max_length, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **beam_kwargs, - **logits_warper_kwargs, - **model_kwargs, - ) - # beam_search 
does not automatically interleave `batch_size` dim for `num_beams` - torch.manual_seed(0) - kwargs = {} - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - num_interleave=beam_scorer.num_beams, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - elif attention_mask is not None: - attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) - - # prevent flaky generation test failures - logits_processor = LogitsProcessorList() - logits_processor.append(InfNanRemoveLogitsProcessor()) - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_beam_sample = model.beam_sample( - input_ids.repeat_interleave(beam_scorer.num_beams, dim=0), - beam_scorer, - max_length=max_length, - logits_warper=logits_warper, - logits_processor=logits_processor, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - - return output_generate, output_beam_sample - - def _group_beam_search_generate( - self, - model, - input_ids, - attention_mask, - max_length, - beam_scorer, - beam_kwargs, - logits_processor, - logits_process_kwargs, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_generate = model.generate( - input_ids, - do_sample=False, - max_length=max_length, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **beam_kwargs, - **logits_process_kwargs, - **model_kwargs, - ) - - # group_beam_search does not automatically interleave `batch_size` dim for `num_beams` - kwargs = {} - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - num_interleave=beam_scorer.num_beams, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - elif attention_mask is not None: - attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_group_beam_search = model.group_beam_search( - input_ids.repeat_interleave(beam_scorer.num_beams, dim=0), - beam_scorer, - max_length=max_length, - logits_processor=logits_processor, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - return output_generate, output_group_beam_search - - def _constrained_beam_search_generate( - self, - model, - input_ids, - attention_mask, - max_length, - constrained_beam_scorer, - constraints, - beam_kwargs, - logits_processor, - logits_process_kwargs, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - 
return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - model.generation_config.static_shapes = self._get_static_shapes() - output_generate = model.generate( - input_ids, - do_sample=False, - max_length=max_length, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - constraints=constraints, - **beam_kwargs, - **logits_process_kwargs, - **model_kwargs, - ) - - # group_beam_search does not automatically interleave `batch_size` dim for `num_beams` - kwargs = {} - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - num_interleave=constrained_beam_scorer.num_beams, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - elif attention_mask is not None: - attention_mask = attention_mask.repeat_interleave(constrained_beam_scorer.num_beams, dim=0) - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - output_group_beam_search = model.constrained_beam_search( - input_ids.repeat_interleave(constrained_beam_scorer.num_beams, dim=0), - constrained_beam_scorer, - max_length=max_length, - logits_processor=logits_processor, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - ) - return output_generate, output_group_beam_search - - def _contrastive_generate( - self, - model, - input_ids, - attention_mask, - max_length, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - contrastive_search_kwargs = { - "penalty_alpha": 0.6, - "top_k": 5, - } - - if model.config.is_encoder_decoder: - max_length = 4 - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - eos_token_id=model.config.eos_token_id, - forced_bos_token_id=model.config.forced_bos_token_id, - forced_eos_token_id=model.config.forced_eos_token_id, - max_length=max_length, - ) - - kwargs = {} - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - model.generation_config.static_shapes = self._get_static_shapes() - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_length=max_length, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **logits_process_kwargs, - **model_kwargs, - **contrastive_search_kwargs, - ) - - if model.config.is_encoder_decoder: - encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( - model, - input_ids, - attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - kwargs["encoder_outputs"] = encoder_outputs - - with torch.no_grad(): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - self._update_default_model_kwargs(model_kwargs) - stopping_criteria = 
StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)]) - output_contrastive = model.contrastive_search( - input_ids, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - **kwargs, - **model_kwargs, - **contrastive_search_kwargs, - ) - return output_contrastive, output_generate - - def test_greedy_generate(self): - # check `generate()` and `greedy_search()` are equal - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - # test old generation output for backwards compatibility - model = model_class(config).to(torch_device).eval() - output_greedy, output_generate = self._greedy_generate( - model=model, input_ids=input_ids, attention_mask=attention_mask, max_length=max_length - ) - self.assertListEqual(output_greedy.tolist(), output_generate.tolist()) - - def test_greedy_generate_dict_outputs(self): - for model_class in self.all_generative_model_classes: - # disable cache - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - output_greedy, output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) - else: - self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_greedy.sequences.tolist()) - - for output in (output_greedy, output_generate): - self._check_outputs(output, input_ids, model.config) - - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - # enable cache - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - if not hasattr(config, "use_cache"): - # only relevant if model has "use_cache" - return - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - output_greedy, output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertListEqual(output_generate.sequences.tolist(), output_greedy.sequences.tolist()) - - for output in (output_greedy, output_generate): - self._check_outputs(output, input_ids, model.config, use_cache=True) - - def test_sample_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - - if model.config.is_encoder_decoder: - max_length = 4 - - process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - model.config.eos_token_id, - forced_bos_token_id=model.config.forced_bos_token_id, - 
forced_eos_token_id=model.config.forced_eos_token_id, - max_length=max_length, - ) - logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=2) - - # check `generate()` and `sample()` are equal - output_sample, output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - num_return_sequences=1, - logits_processor=logits_processor, - logits_warper=logits_warper, - logits_warper_kwargs=logits_warper_kwargs, - process_kwargs=process_kwargs, - ) - self.assertListEqual(output_sample.tolist(), output_generate.tolist()) - - # check `generate()` and `sample()` yield equal results for `num_return_sequences` - output_sample, output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - num_return_sequences=3, - logits_processor=logits_processor, - logits_warper=logits_warper, - logits_warper_kwargs=logits_warper_kwargs, - process_kwargs=process_kwargs, - ) - self.assertListEqual(output_sample.tolist(), output_generate.tolist()) - - def test_sample_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - # disable cache - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - model.config.eos_token_id, - forced_bos_token_id=model.config.forced_bos_token_id, - forced_eos_token_id=model.config.forced_eos_token_id, - max_length=max_length, - ) - logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) - - output_sample, output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - num_return_sequences=2, - logits_processor=logits_processor, - logits_warper=logits_warper, - logits_warper_kwargs=logits_warper_kwargs, - process_kwargs=process_kwargs, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_sample, SampleEncoderDecoderOutput) - self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_sample, SampleDecoderOnlyOutput) - self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_sample.sequences.tolist()) - - for output in (output_sample, output_generate): - self._check_outputs(output, input_ids, model.config, num_return_sequences=2) - - def test_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, 
- config.forced_eos_token_id, - max_length, - ) - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - - # check `generate()` and `beam_search()` are equal - output_generate, output_beam_search = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_process_kwargs=logits_process_kwargs, - logits_processor=logits_processor, - ) - - self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) - - if model.config.is_encoder_decoder: - max_length = 4 - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - - output_generate, output_beam_search = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_process_kwargs=logits_process_kwargs, - logits_processor=logits_processor, - ) - self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) - - def test_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # disable cache - config.use_cache = False - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - ) - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - output_generate, output_beam_search = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_process_kwargs=logits_process_kwargs, - logits_processor=logits_processor, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_search, BeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_search, BeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_beam_search.sequences.tolist()) - self.assertTrue( - torch.allclose(output_generate["sequences_scores"], output_beam_search["sequences_scores"], atol=1e-3) - ) - self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) - self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) - - for output in (output_beam_search, output_generate): - self._check_outputs(output, input_ids, model.config, num_return_sequences=beam_scorer.num_beams) - - def test_beam_search_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - # enable cache - 
config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - if not hasattr(config, "use_cache"): - # only relevant if model has "use_cache" - return - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - ) - - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - output_beam, output_generate = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_process_kwargs=logits_process_kwargs, - logits_processor=logits_processor, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertListEqual(output_generate.sequences.tolist(), output_beam.sequences.tolist()) - - for output in (output_beam, output_generate): - self._check_outputs( - output, input_ids, model.config, use_cache=True, num_return_sequences=beam_scorer.num_beams - ) - - @pytest.mark.skip("Beam search sampling is not supported by optimum-habana yet") - def test_beam_sample_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) - - model = model_class(config).to(torch_device).eval() - - # check `generate()` and `beam_search()` are equal - if model.config.is_encoder_decoder: - max_length = 4 - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - - output_generate, output_beam_sample = self._beam_sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_warper=logits_warper, - logits_warper_kwargs=logits_warper_kwargs, - ) - self.assertListEqual(output_generate.tolist(), output_beam_sample.tolist()) - - @pytest.mark.skip("Beam search sampling is not supported by optimum-habana yet") - def test_beam_sample_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # disable cache - config.use_cache = False - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all 
shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) - - if model.config.is_encoder_decoder: - max_length = 4 - beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - - output_beam_sample, output_generate = self._beam_sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_warper=logits_warper, - logits_warper_kwargs=logits_warper_kwargs, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_sample, BeamSampleEncoderDecoderOutput) - self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_sample, BeamSampleDecoderOnlyOutput) - self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_beam_sample.sequences.tolist()) - self.assertTrue( - torch.allclose(output_generate["sequences_scores"], output_beam_sample["sequences_scores"], atol=1e-3) - ) - self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) - self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) - - for output in (output_beam_sample, output_generate): - self._check_outputs(output, input_ids, model.config, num_return_sequences=beam_scorer.num_beams) - - def test_generate_without_input_ids(self): - config, _, _, max_length = self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - return - - for model_class in self.all_generative_model_classes: - model = model_class(config).to(torch_device) - model.eval() - - output_ids_generate = model.generate(do_sample=False, max_length=max_length, remove_invalid_values=True) - self.assertIsNotNone(output_ids_generate) - - @pytest.mark.skip("Group beam search is not supported by optimum-habana") - def test_group_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - diversity_penalty=2.0, - ) - - # check `generate()` and `group_beam_search()` are equal - beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs(input_ids.shape[0], max_length) - output_generate, output_group_beam_search = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - 
logits_process_kwargs=logits_process_kwargs, - ) - self.assertListEqual(output_generate.tolist(), output_group_beam_search.tolist()) - - # check `generate()` and `group_beam_search()` are equal for `num_return_sequences` - num_return_sequences = 2 - if model.config.is_encoder_decoder: - max_length = 4 - beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs( - input_ids.shape[0], max_length, num_return_sequences=num_return_sequences - ) - output_generate, output_group_beam_search = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - logits_process_kwargs=logits_process_kwargs, - ) - self.assertListEqual(output_generate.tolist(), output_group_beam_search.tolist()) - - @pytest.mark.skip("Group beam search is not supported by optimum-habana") - def test_group_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - config.use_cache = False - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 4 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - diversity_penalty=2.0, - ) - - num_return_sequences = 1 - beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs( - input_ids.shape[0], max_length, num_return_sequences=num_return_sequences - ) - output_generate, output_group_beam_search = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - beam_scorer=beam_scorer, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - logits_process_kwargs=logits_process_kwargs, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - if model.config.is_encoder_decoder: - self.assertIsInstance(output_group_beam_search, BeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertIsInstance(output_group_beam_search, BeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_group_beam_search.sequences.tolist()) - self.assertTrue( - torch.allclose( - output_generate["sequences_scores"], output_group_beam_search["sequences_scores"], atol=1e-3 - ) - ) - self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) - self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) - - for output in (output_group_beam_search, output_generate): - self._check_outputs( - output, input_ids, model.config, num_return_sequences=num_return_sequences * beam_scorer.num_beams - ) - - def test_constrained_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length 
= self._get_input_ids_and_config() - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - max_length = 20 - - logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - ) - - # check `generate()` and `constrained_beam_search()` are equal - # Sample constraints - if not input_ids.dtype == torch.float32: - min_id = torch.min(input_ids) + 3 - max_id = torch.max(input_ids) - else: - # otherwise this throws an error for Speech2TextModel since its inputs are floating points - min_id = 3 - max_id = 100 - - force_tokens = torch.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - beam_kwargs, beam_scorer = self._get_constrained_beam_scorer_and_kwargs( - input_ids.shape[0], max_length, constraints, num_return_sequences=1 - ) - output_generate, output_beam_search = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - constrained_beam_scorer=beam_scorer, - constraints=constraints, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - logits_process_kwargs=logits_process_kwargs, - ) - self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) - for generation_output in output_generate: - self._check_sequence_inside_sequence(force_tokens, generation_output) - - # check `generate()` and `constrained_beam_search()` are equal for `num_return_sequences` - # Sample constraints - force_tokens = torch.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - num_return_sequences = 2 - max_length = 20 - - beam_kwargs, beam_scorer = self._get_constrained_beam_scorer_and_kwargs( - input_ids.shape[0], max_length, constraints, num_return_sequences=num_return_sequences - ) - - output_generate, output_beam_search = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - constrained_beam_scorer=beam_scorer, - constraints=constraints, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - logits_process_kwargs=logits_process_kwargs, - ) - self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) - - for generation_output in output_generate: - self._check_sequence_inside_sequence(force_tokens, generation_output) - - def test_constrained_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # disable cache - config.use_cache = False - - # It is important set set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated which could lead to flaky circle ci - # failures if the top `num_return_sequences` beams are all shorter than the longest beam - config.eos_token_id = None - config.forced_eos_token_id = None - - model = model_class(config).to(torch_device).eval() - if model.config.is_encoder_decoder: - max_length = 20 - - logits_process_kwargs, logits_processor = 
self._get_logits_processor_and_kwargs( - input_ids.shape[-1], - config.eos_token_id, - config.forced_bos_token_id, - config.forced_eos_token_id, - max_length, - ) - - # Sample constraints - min_id = 3 - max_id = model.config.vocab_size - force_tokens = torch.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - beam_kwargs, beam_scorer = self._get_constrained_beam_scorer_and_kwargs( - input_ids.shape[0], max_length, constraints, num_return_sequences=1 - ) - output_generate, output_beam_search = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - constrained_beam_scorer=beam_scorer, - constraints=constraints, - beam_kwargs=beam_kwargs, - logits_processor=logits_processor, - logits_process_kwargs=logits_process_kwargs, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - if model.config.is_encoder_decoder: - self.assertIsInstance(output_beam_search, BeamSearchEncoderDecoderOutput) - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertIsInstance(output_beam_search, BeamSearchDecoderOnlyOutput) - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self.assertListEqual(output_generate.sequences.tolist(), output_beam_search.sequences.tolist()) - self.assertTrue( - torch.allclose(output_generate["sequences_scores"], output_beam_search["sequences_scores"], atol=1e-3) - ) - self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) - self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) - - for output in (output_beam_search, output_generate): - self._check_outputs(output, input_ids, model.config, num_return_sequences=beam_scorer.num_beams) - - def test_contrastive_generate(self): - # check `generate()` and `contrastive_search()` are equal - for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - return - - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config, "use_cache"): - return - config.use_cache = True - config.is_decoder = True - - # test old generation output for backwards compatibility - model = model_class(config).to(torch_device).eval() - output_contrastive, output_generate = self._contrastive_generate( - model=model, input_ids=input_ids, attention_mask=attention_mask, max_length=max_length - ) - self.assertListEqual(output_contrastive.tolist(), output_generate.tolist()) - - def test_contrastive_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - return - - # enable cache - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - - # NOTE: contrastive search only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - return - config.use_cache = True - config.is_decoder = True - - model = model_class(config).to(torch_device).eval() - output_contrastive, output_generate = self._contrastive_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - max_length=max_length, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertListEqual(output_generate.sequences.tolist(), output_contrastive.sequences.tolist()) - - for output in (output_contrastive, output_generate): - self._check_outputs(output, input_ids, model.config, use_cache=True) - - def test_contrastive_generate_low_memory(self): - # Check that choosing 'low_memory' does not change the model output - for model_class in self.all_generative_model_classes: - # won't fix: FSMT, Reformer, gptbigcode, and speech2text have a different cache variable type (and format). - if any( - model_name in model_class.__name__.lower() - for model_name in ["fsmt", "reformer", "gptbigcode", "speech2text"] - ): - return - - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config(batch_size=1) - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config, "use_cache"): - return - - config.use_cache = True - config.is_decoder = True - - # test output equality of low versus high memory - model = model_class(config).to(torch_device).eval() - - low_output = model.generate( - input_ids, - top_k=4, - penalty_alpha=0.6, - low_memory=True, - max_length=max_length, - attention_mask=attention_mask, - ) - - high_output = model.generate( - input_ids, - top_k=4, - penalty_alpha=0.6, - low_memory=False, - max_length=max_length, - attention_mask=attention_mask, - ) - self.assertListEqual(low_output.tolist(), high_output.tolist()) - - return - - @pytest.mark.skip(reason="Assisted decoding not yet supported by optimum-habana") - @slow # TODO(Joao): remove this. Some models (e.g. data2vec, xcom, roberta) have an error rate between 1 and 10%. - def test_assisted_decoding_matches_greedy_search(self): - # This test ensures that the assisted generation does not introduce output changes over greedy search. - # It breaks the pattern in the tests above, for multiple reasons: - # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to - # prepare the assistant encoder outputs in the main generate body); - # - assisted_decoding does not support `use_cache = False` - # - assisted_decoding does not support `batch_size > 1` - - for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - return - # may fix in the future: the following models fail with assisted decoding, and need model-specific fixes - if any( - model_name in model_class.__name__.lower() - for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"] - ): - return - - # This for loop is a naive and temporary effort to make the test less flaky. - failed = 0 - for i in range(10): - # enable cache - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config(batch_size=1) - - # NOTE: assisted generation only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - return - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - output_greedy = model.generate( - input_ids, - attention_mask=attention_mask, - max_length=max_length, - num_beams=1, - do_sample=False, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - # Note: with assisted generate, if the same model is used as assistant, then all assistant tokens will - # be correct - output_assisted = model.generate( - input_ids, - attention_mask=attention_mask, - max_length=max_length, - num_beams=1, - do_sample=False, - assistant_model=model, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - try: - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) - - for output in (output_greedy, output_assisted): - self._check_outputs(output, input_ids, model.config, use_cache=True) - except AssertionError: - failed += 1 - if failed > 1: - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) - - for output in (output_greedy, output_assisted): - self._check_outputs(output, input_ids, model.config, use_cache=True) - - @pytest.mark.skip(reason="Assisted decoding not yet supported by optimum-habana") - def test_assisted_decoding_sample(self): - # In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not - # match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with - # different shapes, see https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535). - for model_class in self.all_generative_model_classes: - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") - if any( - model_name in model_class.__name__.lower() - for model_name in [ - "bigbirdpegasus", - "led", - "mega", - "speech2text", - "git", - "prophetnet", - "seamlessm4t", - "clvp", - ] - ): - self.skipTest("May fix in the future: need model-specific fixes") - - # enable cache - config, input_ids, attention_mask, _ = self._get_input_ids_and_config(batch_size=1) - - # NOTE: assisted generation only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") - - config.use_cache = True - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - # Sets assisted generation arguments such that: - # a) no EOS is generated, to ensure generation doesn't break early - # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of - # the assistant model is correct - # c) there are at least two forward passes in the main model, to ensure the input preparation of - # the main model is correct - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 2 # see b) - assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b) - generation_kwargs = { - "eos_token_id": -1, # see a) - "max_new_tokens": 4, # see c) - "num_beams": 1, - "do_sample": True, - "assistant_model": assistant_model, - "output_scores": True, - "output_hidden_states": True, - "output_attentions": True, - "return_dict_in_generate": True, - } - - ####################################################################### - # Monkey patch assisted decoding function till SW issue is resolved - import copy - from types import MethodType - from typing import List, Optional, Union - - from transformers.generation.utils import ( - GenerateDecoderOnlyOutput, - _crop_past_key_values, - _prepare_attention_mask, - _prepare_token_type_ids, - _split_model_outputs, - ) - - def _speculative_sampling( - candidate_input_ids, - candidate_logits, - candidate_length, - new_logits, - last_assistant_token_is_eos, - max_matches, - ): - """ - Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns - the selected tokens, as well as the number of candidate matches. - - NOTE: Unless otherwise stated, the variable names match those in the paper. - """ - new_candidate_input_ids = candidate_input_ids[:, -candidate_length:] - # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens - # selected by the assistant, respectively. - q = candidate_logits.softmax(dim=-1) - q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids.squeeze()].squeeze(0, 1) - p = new_logits.softmax(dim=-1) - p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids.squeeze()].squeeze(0, 1) - probability_ratio = p_i / q_i - - # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller - # than the model probability for the same token"), keep the token. Otherwise reject with p = 1 - probability_ratio - # (= keep with p = probability_ratio). Keep all the tokens until the first rejection - r_i = torch.rand_like(probability_ratio) - is_accepted = r_i <= probability_ratio - n_matches = ((~is_accepted).cumsum(dim=-1) < 1).sum() # this is `n` in algorithm 1 - - # Ensure we don't generate beyond max_len or an EOS token (not in algorithm 1, but needed for correct behavior) - if last_assistant_token_is_eos and n_matches == candidate_length: - # Output length is assumed to be `n_matches + 1`. Since we won't generate another token with the target model - # due to acceptance on EOS we fix `n_matches` - n_matches -= 1 - valid_tokens = new_candidate_input_ids[:, : n_matches + 1] - else: - n_matches = min(n_matches, max_matches) - - # Next token selection: if there is a rejection, adjust the distribution from the main model before sampling. 
- gamma = min(candidate_logits.shape[1], max_matches) - p_n_plus_1 = p[:, n_matches, :] - if n_matches < gamma: - q_n_plus_1 = q[:, n_matches, :] - p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0) - p_prime.div_(p_prime.sum()) - else: - p_prime = p_n_plus_1 - t = torch.multinomial(p_prime, num_samples=1).squeeze(1)[None, :] - - # The selected tokens include the matches (if any) plus the next sampled tokens - if n_matches > 0: - valid_tokens = torch.cat((new_candidate_input_ids[:, :n_matches], t), dim=-1) - else: - valid_tokens = t - - return valid_tokens, n_matches - - def assisted_decoding( - self, - input_ids: torch.LongTensor, - assistant_model: Optional["PreTrainedModel"] = None, - candidate_generator: Optional["CandidateGenerator"] = None, - do_sample: bool = False, - logits_processor: Optional[LogitsProcessorList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** or - **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a - candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text - models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.candidate_decoding`] directly. Use - generate() instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - candidate_generator (`CandidateGenerator`, *optional*): - A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For - more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function. - assistant_model (`PreTrainedModel`, *optional*): - An assistant model that can be used to accelerate generation. The assistant model must have the exact - same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model - is much faster than running generation with the model you're calling generate from. As such, the - assistant model should be much smaller. - do_sample (`bool`, *optional*, defaults to `False`): - Whether or not to use sampling ; use greedy decoding otherwise. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. 
- stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilgpt2") - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - >>> outputs = model.assisted_decoding( - ... input_ids, - ... assistant_model=assistant_model, - ... logits_processor=logits_processor, - ... stopping_criteria=stopping_criteria, - ... 
) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # handling deprecated arguments - if (assistant_model is None) == (candidate_generator is None): - raise ValueError( - "One (and only one) of `assistant_model` and `candidate_generator` should be defined." - ) - - if assistant_model is not None: - candidate_generator = AssistedCandidateGenerator( - input_ids=input_ids, - assistant_model=assistant_model, - logits_processor=logits_processor, - model_kwargs=model_kwargs, - eos_token_id=eos_token_id, - ) - warnings.warn( - "Passing `assistant_model` to `assisted_decoding` is deprecated and will be removed in v4.38. " - "Pass the `candidate_generator` argument instead.", - FutureWarning, - ) - - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if eos_token_id is not None and pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = ( - torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - ) - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - - # other auxiliary variables - max_len = stopping_criteria[0].max_length - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
- # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - torch.dist.all_reduce(this_peer_finished_flag, op=torch.dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - cur_len = input_ids.shape[-1] - - # 1. Fetch candidate sequences from a `CandidateGenerator` - candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) - candidate_input_ids = candidate_input_ids.to(self.device) - if candidate_logits is not None: - candidate_logits = candidate_logits.to(self.device) - - candidate_length = candidate_input_ids.shape[1] - input_ids.shape[1] - last_assistant_token_is_eos = ( - ~candidate_input_ids[:, -1] - .tile(eos_token_id_tensor.shape[0], 1) - .ne(eos_token_id_tensor.unsqueeze(1)) - .prod(dim=0) - .bool() - ) - - # 2. Use the original model to obtain the next token logits given the candidate sequence. We obtain - # `candidate_length + 1` relevant logits from this process: in the event that all candidates are correct, - # we use this forward pass to also pick the subsequent logits in the original model. - - # 2.1. Prepare the model inputs - candidate_kwargs = copy.copy(model_kwargs) - candidate_kwargs = _prepare_attention_mask( - candidate_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder - ) - candidate_kwargs = _prepare_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1]) - - model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs) - - # 2.2. Run a forward pass on the candidate sequence - outputs = self( - **model_inputs, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - # 2.3. Process the new logits - new_logits = outputs.logits[:, -candidate_length - 1 :] # excludes the input prompt if present - if len(logits_processor) > 0: - for i in range(candidate_length + 1): - new_logits[:, i, :] = logits_processor( - candidate_input_ids[:, : cur_len + i], new_logits[:, i, :] - ) - if len(logits_warper) > 0: - for i in range(candidate_length + 1): - new_logits[:, i, :] = logits_warper( - candidate_input_ids[:, : cur_len + i], new_logits[:, i, :] - ) - - # 3. Select the accepted tokens. There are two possible cases: - # Case 1: `do_sample=True` and we have logits for the candidates (originally from speculative decoding) - # 👉 Apply algorithm 1 from the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf). - max_matches = max_len - cur_len - 1 - if do_sample and candidate_logits is not None: - valid_tokens, n_matches = _speculative_sampling( - candidate_input_ids, - candidate_logits, - candidate_length, - new_logits, - last_assistant_token_is_eos, - max_matches, - ) - - # Case 2: all other cases (originally from assisted generation) 👉 Compare the tokens selected from the - # original model logits with the candidate tokens. We can keep the candidate tokens until the first - # mismatch, or until the max length is reached. 
- else: - if do_sample: - probs = new_logits.softmax(dim=-1) - selected_tokens = torch.multinomial(probs[0, :, :], num_samples=1).squeeze(1)[None, :] - else: - selected_tokens = new_logits.argmax(dim=-1) - - candidate_new_tokens = candidate_input_ids[:, cur_len:] - n_matches = ((~(candidate_new_tokens == selected_tokens[:, :-1])).cumsum(dim=-1) < 1).sum() - - # Ensure we don't generate beyond max_len or an EOS token - if last_assistant_token_is_eos and n_matches == candidate_length: - n_matches -= 1 - n_matches = min(n_matches, max_matches) - valid_tokens = selected_tokens[:, : n_matches + 1] - - # 4. Update variables according to the number of matching assistant tokens. Remember: the token generated - # by the model after the last candidate match is also valid, as it is generated from a correct sequence. - # Because of this last token, assisted generation search reduces to a normal greedy search/sample if there - # is no match. - - # 4.1. Get the valid continuation, after the matching tokens - input_ids = torch.cat((input_ids, valid_tokens), dim=-1) - if streamer is not None: - streamer.put(valid_tokens.cpu()) - new_cur_len = input_ids.shape[-1] - - # 4.2. Discard past key values relative to unused assistant tokens - new_cache_size = new_cur_len - 1 - outputs.past_key_values = _crop_past_key_values(self, outputs.past_key_values, new_cache_size) - - # 5. Update the candidate generation strategy if needed - candidate_generator.update_candidate_strategy(input_ids, new_logits, n_matches) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - # Store scores, attentions and hidden_states when required - # Assistant: modified to append one tuple element per token, as in the other generation methods. 
- if return_dict_in_generate: - if output_scores: - scores += tuple(new_logits[:, i, :] for i in range(n_matches + 1)) - - if "past_key_values" not in model_kwargs: - added_len = new_cur_len - else: - added_len = n_matches + 1 - - if output_attentions: - if self.config.is_encoder_decoder: - cross_attentions = _split_model_outputs( - cross_attentions, outputs.cross_attentions, cur_len, added_len - ) - decoder_attentions = _split_model_outputs( - decoder_attentions, - outputs.decoder_attentions, - cur_len, - added_len, - is_decoder_attention=True, - ) - else: - decoder_attentions = _split_model_outputs( - decoder_attentions, - outputs.attentions, - cur_len, - added_len, - is_decoder_attention=True, - ) - if output_hidden_states: - if self.config.is_encoder_decoder: - decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.decoder_hidden_states, cur_len, added_len - ) - else: - decoder_hidden_states = _split_model_outputs( - decoder_hidden_states, outputs.hidden_states, cur_len, added_len - ) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - input_ids[:, -1] - .tile(eos_token_id_tensor.shape[0], 1) - .ne(eos_token_id_tensor.unsqueeze(1)) - .prod(dim=0) - ) - - # stop when each sentence is finished - if unfinished_sequences.max() == 0: - this_peer_finished = True - - # stop if we exceed the maximum length - if stopping_criteria(input_ids, scores): - this_peer_finished = True - - if this_peer_finished and not synced_gpus: - break - - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return input_ids - - model.assisted_decoding = MethodType(assisted_decoding, model) - - ####################################################################### - - output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs) - - self._check_outputs(output_assisted, input_ids, model.config, use_cache=True) - - def test_generate_with_head_masking(self): - """Test designed for encoder-decoder models to ensure the attention head masking is used.""" - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() - # We want to test only encoder-decoder models - if not config.is_encoder_decoder: - continue - model = model_class(config).to(torch_device) - - head_masking = { - "head_mask": torch.zeros(config.encoder_layers, config.encoder_attention_heads, device=torch_device), - "decoder_head_mask": torch.zeros( - config.decoder_layers, config.decoder_attention_heads, device=torch_device - ), - "cross_attn_head_mask": torch.zeros( - 
config.decoder_layers, config.decoder_attention_heads, device=torch_device - ), - } - - signature = inspect.signature(model.forward) - # We want to test only models where encoder/decoder head masking is implemented - if not set(head_masking.keys()) < {*signature.parameters.keys()}: - continue - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - out = model.generate( - input_ids, - attention_mask=attention_mask, - num_beams=1, - output_attentions=True, - return_dict_in_generate=True, - remove_invalid_values=True, - **{name: mask}, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - def test_left_padding_compatibility(self): - # The check done in this test is fairly difficult -- depending on the model architecture, passing the right - # position index for the position embeddings can still result in a different output, due to numerical masking. - # On the other hand, for some types of position embeddings, an incorrect position index can have a minimal - # impact on the output. - # There are two tricks employed to check whether left-padding compatibility is in place: - # 1 - To reduce the negative impact of the numerical attention mask on a correct position index, we set the - # padding size to 1. - # 2 - To reduce the chance of false positives (i.e. passing when it should be failing), we run the check - # multiple times with random inputs, and it has to pass with all of them. - # NOTE: because of 2), there is some chance of false positives in this test. - - for model_class in self.all_generative_model_classes: - config, _, _, _ = self._get_input_ids_and_config() - if config.is_encoder_decoder: - continue # skip for encoder-decoder models -- they don't need left-padding compatibility - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - no_failures = True - for _ in range(10): # there may be false positives with 10 runs, we rely on the CI to catch the flakiness - _, input_ids, attention_mask, _ = self._get_input_ids_and_config() - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - pad_size = (input_ids.shape[0], 1) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = {"input_ids": padded_input_ids, "attention_mask": padded_attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(padded_attention_mask, dim=-1) - 1 - position_ids.masked_fill_(padded_attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - if not torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-7): - no_failures = False - break - - self.assertTrue(no_failures) - - def test_past_key_values_format(self): - # Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. 
Having a - # standard KV cache format is important for a consistent API (and for advanced generation methods). - for model_class in self.all_generative_model_classes: - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - # If it doesn't support cache, pass the test - if not hasattr(config, "use_cache"): - return - - model = model_class(config).to(torch_device) - if "use_cache" not in inputs: - inputs["use_cache"] = True - outputs = model(**inputs) - - # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) - if "past_key_values" not in outputs: - return - - num_hidden_layers = ( - getattr(config, "decoder_layers", None) - or getattr(config, "num_decoder_layers", None) - or config.num_hidden_layers - ) - num_attention_heads = getattr(config, "decoder_attention_heads", config.num_attention_heads) - embed_dim = getattr(config, "d_model", config.hidden_size) - per_head_embed_dim = embed_dim // num_attention_heads - - past_kv = outputs["past_key_values"] - self.assertEqual(len(past_kv), num_hidden_layers) - - # Encoder-Decoder checks - if config.is_encoder_decoder: - encoder_num_attention_heads = config.encoder_attention_heads - encoder_per_head_embed_dim = embed_dim // encoder_num_attention_heads - batch_size, seq_length = inputs["decoder_input_ids"].shape - for i in range(num_hidden_layers): - self.assertEqual(len(past_kv[i]), 4) # K V for the decoder + K V for the encoder = 4 - self.assertEqual( - past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - self.assertEqual( - past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - # The sequence length for the encoder K V depends on the model. Since it is not manipulated in - # autoregressive generation, I'm keeping the test general and not checking the 3rd dim - self.assertEqual( - (past_kv[i][2].shape[0], past_kv[i][2].shape[1], past_kv[i][2].shape[3]), - (batch_size, encoder_num_attention_heads, encoder_per_head_embed_dim), - ) - self.assertEqual( - (past_kv[i][3].shape[0], past_kv[i][3].shape[1], past_kv[i][3].shape[3]), - (batch_size, encoder_num_attention_heads, encoder_per_head_embed_dim), - ) - - # Decoder-only checks - else: - # TODO: this line is only needed because of imagegpt, where "pixel_values" = "input_ids". 
Fix the - # tests in imagegpt such that `prepare_config_and_inputs_for_common` returns the later (and the other - # tests use it) - key = "input_ids" if "input_ids" in inputs else "pixel_values" - batch_size, seq_length = inputs[key].shape - for i in range(num_hidden_layers): - self.assertEqual(len(past_kv[0]), 2) # K V for the decoder = 2 - self.assertEqual( - past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - self.assertEqual( - past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - - def test_generate_from_inputs_embeds_decoder_only(self): - # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` - # if fails, you should probably update the `prepare_inputs_for_generation` function - for model_class in self.all_generative_model_classes: - config, input_ids, _, _ = self._get_input_ids_and_config() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the - # decoder) - if config.is_encoder_decoder: - continue - - # Skip models without explicit support - model = model_class(config).to(torch_device).eval() - if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): - continue - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids) - self.assertEqual(outputs_from_ids.shape, (2, 20)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate(input_ids, inputs_embeds=inputs_embeds) - self.assertListEqual(outputs_from_ids.tolist(), outputs_from_embeds.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs - torch.manual_seed(0) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate(input_ids, inputs_embeds=random_embeds) - with self.assertRaises(AssertionError): - self.assertListEqual(outputs_from_rand_embeds.tolist(), outputs_from_embeds.tolist()) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=20 - inputs_embeds.shape[1] - ) - self.assertListEqual( - outputs_from_embeds[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids[:, 1:].tolist(), - ) - - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, seq_length = input_ids.shape - num_sequences_in_output = batch_size * num_return_sequences - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # Attentions - if config.is_encoder_decoder: - # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, 
config, seq_length) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - else: - # if use_cache first input is equal to no use_cache, so skip here - attentions = output.attentions if not use_cache else output.attentions[1:] - min_length = seq_length if not use_cache else seq_length + 1 - self._check_attentions_for_generate( - num_sequences_in_output, - attentions=attentions, - min_length=min_length, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - if config.is_encoder_decoder: - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - else: - # if use_cache first input is equal to no use_cache, so skip here - hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] - min_length = seq_length if not use_cache else seq_length + 1 - self._check_hidden_states_for_generate( - num_sequences_in_output, - hidden_states, - min_length=min_length, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - def _check_scores(self, batch_size, scores, length, config): - expected_shape = (batch_size, config.vocab_size) - self.assertIsInstance(scores, tuple) - self.assertEqual(len(scores), length) - self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx if not use_cache else 1 - src_len = min_length + idx - - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length): - encoder_expected_shape = (batch_size, config.num_attention_heads, seq_length, seq_length) - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [layer_attentions.shape for layer_attentions in attentions], - [encoder_expected_shape] * len(attentions), - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - seq_len = min_length + idx if not use_cache else 1 - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check 
hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - - def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length): - encoder_expected_shape = (batch_size, seq_length, config.hidden_size) - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in hidden_states], - [encoder_expected_shape] * len(hidden_states), - ) - - def _check_sequence_inside_sequence(self, tensor_1, tensor_2): - # check if tensor_1 inside tensor_2 or tensor_2 inside tensor_1. - # set to same device. we don't care what device. - - if not isinstance(tensor_1, list): - tensor_1 = tensor_1.cpu().tolist() - if not isinstance(tensor_2, list): - tensor_2 = tensor_2.cpu().tolist() - - in_order = len(tensor_1) <= len(tensor_2) - longer = tensor_2 if in_order else tensor_1 - shorter = tensor_1 if in_order else tensor_2 - - flag = False - chunk_size = len(shorter) - for chunk_idx in range(len(longer) - chunk_size + 1): - subseq = longer[chunk_idx : chunk_idx + chunk_size] - if subseq == shorter: - flag = True - break - - self.assertTrue(flag) - - -@require_torch -class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports - if is_torch_available(): - framework_dependent_parameters = { - "AutoModelForCausalLM": AutoModelForCausalLM, - "AutoModelForSpeechSeq2Seq": AutoModelForSpeechSeq2Seq, - "AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM, - "AutoModelForVision2Seq": AutoModelForVision2Seq, - "LogitsProcessorList": LogitsProcessorList, - "MinLengthLogitsProcessor": MinLengthLogitsProcessor, - "create_tensor_fn": torch.tensor, - "floats_tensor": floats_tensor, - "return_tensors": "pt", - } - - @slow - def test_diverse_beam_search(self): - # PT-only test: TF doesn't have a diverse beam search implementation - article = """Justin Timberlake and Jessica Biel, welcome to parenthood. - The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People. - "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports. - The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both.""" - - bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - outputs = bart_model.generate( - input_ids, - num_beams=4, - num_return_sequences=2, - num_beam_groups=4, - diversity_penalty=2.0, - remove_invalid_values=True, - ) - - generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The couple announced the birth of their son, Silas Randall Timberlake, in a statement. Silas was the" - " middle name of Timberlake's maternal grandfather Bill Bomar. Randall is the musician's own middle" - " name, as well as his father's first. It is the first baby for both of them.", - "Justin Timberlake and Jessica Biel have a son. The baby is named Silas Randall Timberlake. It is the" - " first child for both. The couple announced the pregnancy in January. 
The name Silas is the middle" - " name of Timberlake's maternal grandfather. It's also his own middle name.", - ], - ) - - def test_max_length_backward_compat_greedy(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - max_length = 20 - input_ids = input_ids.expand(2, -1) - model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( - batch_size=input_ids.shape[0], - model_input_name=bart_model.main_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=bart_model.config.decoder_start_token_id, - bos_token_id=bart_model.config.bos_token_id, - ) - - with self.assertWarns(UserWarning): - bart_model.greedy_search( - input_ids, - max_length=max_length, - pad_token_id=bart_model.config.pad_token_id, - eos_token_id=bart_model.config.eos_token_id, - **model_kwargs, - ) - - def test_max_length_backward_compat_sample(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - max_length = 20 - input_ids = input_ids.expand(2, -1) - model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( - batch_size=input_ids.shape[0], - model_input_name=bart_model.main_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=bart_model.config.decoder_start_token_id, - bos_token_id=bart_model.config.bos_token_id, - ) - with torch.no_grad(): - with self.assertWarns(UserWarning): - bart_model.sample( - input_ids, - max_length=max_length, - pad_token_id=bart_model.config.pad_token_id, - eos_token_id=bart_model.config.eos_token_id, - **model_kwargs, - ) - - def test_max_length_backward_compat_beam_search(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - batch_size = 1 - max_length = 20 - num_beams = 2 - - input_ids = input_ids.expand(2, -1) - model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( - batch_size=input_ids.shape[0], - model_input_name=bart_model.main_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=bart_model.config.decoder_start_token_id, - bos_token_id=bart_model.config.bos_token_id, - ) - - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - device=torch_device, - ) - with 
self.assertWarns(UserWarning): - _ = bart_model.beam_search( - input_ids, num_beams=num_beams, max_length=max_length, beam_scorer=beam_scorer, **model_kwargs - ) - - def test_max_length_backward_compat_group_beam_search(self): - # PT-only test: TF doesn't have StoppingCriteria & group beam search - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - batch_size = 1 - max_length = 20 - num_beams = 6 - num_beam_groups = 3 - num_return_sequences = num_beams * batch_size - - input_ids = input_ids.expand(6, -1) - model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( - batch_size=input_ids.shape[0], - model_input_name=bart_model.main_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=bart_model.config.decoder_start_token_id, - bos_token_id=bart_model.config.bos_token_id, - ) - - diverse_beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - device=torch_device, - num_beam_hyps_to_keep=num_return_sequences, - num_beam_groups=num_beam_groups, - ) - with self.assertWarns(UserWarning): - bart_model.group_beam_search( - input_ids, diverse_beam_scorer, num_beams=num_beams, max_length=max_length, **model_kwargs - ) - - def test_max_length_warning_if_different(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - batch_size = 1 - - max_length = 20 - num_beams = 6 - num_beam_groups = 3 - num_return_sequences = num_beams * batch_size - stopping_criteria_max_length = 18 - stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=stopping_criteria_max_length)]) - - # Greedy - input_ids = input_ids.expand(6, -1) - model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) - input_ids, model_kwargs = bart_model._prepare_decoder_input_ids_for_generation( - batch_size=input_ids.shape[0], - model_input_name=bart_model.main_input_name, - model_kwargs=model_kwargs, - decoder_start_token_id=bart_model.config.decoder_start_token_id, - bos_token_id=bart_model.config.bos_token_id, - ) - - with self.assertWarns(UserWarning): - bart_model.greedy_search( - input_ids, - max_length=max_length, - pad_token_id=bart_model.config.pad_token_id, - stopping_criteria=stopping_criteria, - eos_token_id=bart_model.config.eos_token_id, - **model_kwargs, - ) - - # Sample - with self.assertWarns(UserWarning): - with torch.no_grad(): - bart_model.sample( - input_ids, - max_length=max_length, - stopping_criteria=stopping_criteria, - pad_token_id=bart_model.config.pad_token_id, - eos_token_id=bart_model.config.eos_token_id, - **model_kwargs, - ) - - # Beam - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - device=torch_device, - ) - with self.assertWarns(UserWarning): - with torch.no_grad(): - bart_model.beam_search( 
- input_ids, - num_beams=num_beams, - stopping_criteria=stopping_criteria, - max_length=max_length, - beam_scorer=beam_scorer, - **model_kwargs, - ) - - # Grouped beam search - diverse_beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - device=torch_device, - num_beam_hyps_to_keep=num_return_sequences, - num_beam_groups=num_beam_groups, - ) - with self.assertWarns(UserWarning): - bart_model.group_beam_search( - input_ids, - diverse_beam_scorer, - stopping_criteria=stopping_criteria, - num_beams=num_beams, - max_length=max_length, - **model_kwargs, - ) - - def test_custom_stopping_criteria_overload_error(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") - bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) - - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - stopping_criteria = StoppingCriteriaList() - stopping_criteria.append(MaxLengthCriteria(max_length=42)) - with self.assertRaises(ValueError): - bart_model.generate(input_ids, stopping_criteria=stopping_criteria) - with self.assertRaises(ValueError): - bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=32) - - def test_custom_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") - bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - class DummyCriteria(StoppingCriteria): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - return input_ids.shape[-1] >= 20 - - stopping_criteria = StoppingCriteriaList() - stopping_criteria.append(DummyCriteria()) - - self.assertEqual( - list(bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=22).shape), - [1, 20], - ) - self.assertEqual( - list(bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=18).shape), - [1, 18], - ) - - def test_stop_sequence_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria - prompt = """Hello I believe in""" - generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-bart") - output = generator(prompt) - self.assertEqual( - output, - [ - { - "generated_text": ( - "Hello I believe in in in number number number number number number number number number" - ) - } - ], - ) - - output = generator(prompt, stop_sequence=" number") - self.assertEqual(output, [{"generated_text": "Hello I believe in in in number"}]) - - def test_generate_non_nlp_input_ids_as_kwarg(self): - # PT-only test: AFAIK there's no non-NLP model architecture in TF that supports `input_ids` as its only input - model = ImageGPTForCausalImageModeling.from_pretrained( - "hf-internal-testing/tiny-random-imagegpt", max_length=10 - ).to(torch_device) - input_ids = ids_tensor((3, 5), vocab_size=10) - - output_sequences_kwargs = model.generate(input_ids=input_ids).cpu() - output_sequences = model.generate(input_ids).cpu() - - self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) - self.assertEqual(output_sequences.shape, (3, 10)) - - 
def test_generate_input_values_as_encoder_kwarg(self): - # PT-only test: AFAIK there's no generate-capable architecture in TF that supports `input_values` as its input - input_values = floats_tensor((2, 250)) - model = SpeechEncoderDecoderModel.from_pretrained("hf-internal-testing/tiny-random-speech-encoder-decoder") - model = model.to(torch_device) - output_sequences_kwargs = model.generate(input_values=input_values, max_length=5).cpu() - output_sequences = model.generate(input_values, max_length=5).cpu() - - self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) - self.assertEqual(output_sequences.shape, (2, 5)) - - def test_transition_scores_group_beam_search_encoder_decoder(self): - # PT-only test: TF doesn't have group beam search - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = BartForConditionalGeneration.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=2, - num_beam_groups=2, - num_return_sequences=2, - diversity_penalty=1.0, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - model = model.to(torch_device) - - input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids.to(torch_device) - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - transition_scores_sum = transition_scores.sum(-1) - - self.assertTrue(torch.allclose(transition_scores_sum, outputs.sequences_scores, atol=1e-3)) - - @slow - def test_beam_search_example_integration(self): - # PT-only test: TF doesn't have a BeamSearchScorer - # exactly the example provided in the docstrings of beam search, which previously - # failed after directly copying from it. Refer to PR #15555 - tokenizer = AutoTokenizer.from_pretrained("t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - encoder_input_str = "translate English to German: How old are you?" 
- encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - # lets run beam search using 3 beams - num_beams = 3 - # define decoder start token ids - input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - input_ids = input_ids * model.config.decoder_start_token_id - - # add encoder_outputs to model keyword arguments - model_kwargs = { - "encoder_outputs": model.get_encoder()( - encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ) - } - - # instantiate beam scorer - beam_scorer = BeamSearchScorer( - batch_size=1, - num_beams=num_beams, - device=model.device, - ) - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ] - ) - - outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt bist du?"]) - - @slow - def test_constrained_beam_search(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - force_tokens = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids - force_tokens_2 = tokenizer("big weapons", add_prefix_space=True, add_special_tokens=False).input_ids - - constraints = [ - PhrasalConstraint(force_tokens), - PhrasalConstraint(force_tokens_2), - ] - - starting_text = ["The soldiers were not prepared and"] - - input_ids = tokenizer(starting_text, return_tensors="pt").input_ids.to(torch_device) - - outputs = model.generate( - input_ids, - constraints=constraints, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - max_length=30, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers were not prepared and didn't know what to do. 
They had no idea how they would react if" - " the enemy attacked them, big weapons scared" - ], - ) - - @slow - def test_constrained_beam_search_mixed(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - force_phrase = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids - flexible_phrases = tokenizer( - ["scream", "screams", "screaming", "screamed"], add_prefix_space=True, add_special_tokens=False - ).input_ids - - constraints = [ - PhrasalConstraint(force_phrase), - DisjunctiveConstraint(flexible_phrases), - ] - - starting_text = ["The soldiers", "The child"] - - input_ids = tokenizer(starting_text, return_tensors="pt").input_ids.to(torch_device) - - outputs = model.generate( - input_ids, - constraints=constraints, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - # max_length=20, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers, who had been stationed at the base for more than a year before being evacuated" - " screaming scared", - "The child was taken to a local hospital where he died.\n 'I don't think screaming scared", - ], - ) - - @slow - def test_constrained_beam_search_mixed_mixin(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - force_word = "scared" - force_flexible = ["scream", "screams", "screaming", "screamed"] - - force_words_ids = [ - tokenizer([force_word], add_prefix_space=True, add_special_tokens=False).input_ids, - tokenizer(force_flexible, add_prefix_space=True, add_special_tokens=False).input_ids, - ] - - starting_text = ["The soldiers", "The child"] - - input_ids = tokenizer(starting_text, return_tensors="pt").input_ids.to(torch_device) - - outputs = model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers, who had been stationed at the base for more than a year before being evacuated" - " screaming scared", - "The child was taken to a local hospital where he died.\n 'I don't think screaming scared", - ], - ) - - @slow - def test_cfg_mixin(self): - model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - input = tokenizer(["The dragon flew over Paris,"], return_tensors="pt", return_attention_mask=True) - input["input_ids"] = input["input_ids"].to(torch_device) - input["attention_mask"] = input["attention_mask"].to(torch_device) - - outputs = model.generate(**input, max_new_tokens=32, guidance_scale=1.5) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The dragon flew over Paris, landing in the Rue de la Bastille. 
The crowd was so excited " - 'that they had to leave the city.\n\n"We\'re going to Paris!"\n' - ], - ) - - neg = tokenizer(["France,"], return_tensors="pt", return_attention_mask=True) - neg["input_ids"] = neg["input_ids"].to(torch_device) - neg["attention_mask"] = neg["attention_mask"].to(torch_device) - outputs = model.generate( - **input, - max_new_tokens=32, - guidance_scale=1.5, - negative_prompt_ids=neg["input_ids"], - negative_prompt_attention_mask=neg["attention_mask"], - ) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - 'The dragon flew over Paris, landing on the pavement.\n\n"Paris!"\n\n"Paris!"\n\n"' - 'Paris!"\n\n"Paris!"\n\n"Paris!"\n\n' - ], - ) - - @slow - def test_constrained_beam_search_example_translation_mixin(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - encoder_input_str = "translate English to German: How old are you?" - force_words = ["sind"] - - input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids - - outputs = model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt sind Sie?"]) - - @slow - def test_constrained_beam_search_example_integration(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - encoder_input_str = "translate English to German: How old are you?" - encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - # lets run beam search using 5 beams - num_beams = 5 - # define decoder start token ids - input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - input_ids = input_ids * model.config.decoder_start_token_id - - # add encoder_outputs to model keyword arguments - model_kwargs = { - "encoder_outputs": model.get_encoder()( - encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ) - } - - constraint_str = "sind" - constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # remove eos token - constraints = [PhrasalConstraint(token_ids=constraint_token_ids)] - - # instantiate beam scorer - beam_scorer = ConstrainedBeamSearchScorer( - batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints - ) - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ] - ) - - outputs = model.constrained_beam_search( - input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs - ) - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt sind Sie?"]) - - def test_constrained_beam_search_mixin_type_checks(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random") - model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random") - - encoder_input_str = "translate English to German: How old are you?" 
- input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - with self.assertRaises(ValueError): - force_words = ["sind"] - force_words_ids = tokenizer(force_words, return_tensors="pt").input_ids - model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - with self.assertRaises(ValueError): - force_words = ["sind"] - force_words_ids = [tokenizer(force_words, return_tensors="pt").input_ids] - model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[]) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[[-1]]) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[[[-1]]]) - - def test_contrastive_search_batched(self): - # PT-only test: TF doesn't have constrained beam search - # Tests that contrastive search works with batched inputs (i.e. has the same output as for non-batched inputs) - articles = ["Foo", "Bar Baz"] - tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to(torch_device) - - model.config.eos_token_id = None - input_ids_batched = tokenizer(articles, padding=True, return_tensors="pt").input_ids.to(torch_device) - input_ids = tokenizer(articles[1], return_tensors="pt").input_ids.to(torch_device) - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate( - input_ids=input_ids, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - - batched_out = tokenizer.decode(output_sequences_batched.sequences[1], skip_special_tokens=True) - out = tokenizer.decode(output_sequences.sequences[0], skip_special_tokens=True) - self.assertEqual(batched_out, out) - - # output_sequences_batched.scores[0][1] -> 1st set of logits, 2nd sequence - max_score_diff = (output_sequences_batched.scores[0][1] - output_sequences.scores[0][0]).abs().max() - self.assertTrue(max_score_diff < 1e-5) - - def test_eos_token_id_int_and_list_top_k_top_sampling(self): - # Has TF equivalent: this test relies on random sampling - generation_kwargs = { - "do_sample": True, - "num_beams": 1, - "top_p": 0.7, - "top_k": 10, - "temperature": 0.7, - } - expectation = 20 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors="pt").to(torch_device) - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - - # Only some seeds will work both on CPU/GPU for a fixed `expectation` value. - # The selected seed is not guaranteed to work on all torch versions. 
- torch.manual_seed(1) - eos_token_id = 846 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - torch.manual_seed(1) - eos_token_id = [846, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_model_kwarg_encoder_signature_filtering(self): - # Has TF equivalent: ample use of framework-specific code - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Hugging Face is a technology company based in New York and Paris.""" - input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to( - torch_device - ) - output = bart_model.generate(input_ids).cpu().numpy() - - # Let's create a fake model that has a different signature. In particular, this fake model accepts "foo" as an - # argument. Because "foo" is not in the encoder signature and doesn't start with "decoder_", it will be part of - # the encoder kwargs prior to signature filtering, which would lead to an exception. But filtering kicks in and - # saves the day. - class FakeBart(BartForConditionalGeneration): - def forward(self, input_ids, foo=None, **kwargs): - return super().forward(input_ids, **kwargs) - - bart_model = FakeBart.from_pretrained("hf-internal-testing/tiny-random-bart").to(torch_device) - fake_output = bart_model.generate(input_ids, foo="bar").cpu().numpy() - self.assertTrue(np.array_equal(output, fake_output)) - - # Encoder signature filtering only kicks in if it doesn't accept wildcard kwargs. The following test will fail - # because it doesn't do signature filtering. 
- class FakeEncoder(bart_model.model.encoder.__class__): - def forward(self, input_ids, **kwargs): - return super().forward(input_ids, **kwargs) - - fake_encoder = FakeEncoder(bart_model.config, bart_model.model.shared).to(torch_device) - bart_model.model.encoder = fake_encoder - - # Normal generation still works (the output will be different because the encoder weights are different) - fake_output = bart_model.generate(input_ids).cpu().numpy() - with self.assertRaises(TypeError): - # FakeEncoder.forward() accepts **kwargs -> no filtering -> type error due to unexpected input "foo" - bart_model.generate(input_ids, foo="bar") - - def test_default_max_length_warning(self): - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="pt") - input_ids = tokenized_inputs.input_ids.to(torch_device) - - # Default generation config value of 20 -> emits warning - with self.assertWarns(UserWarning): - model.generate(input_ids) - - # Explicitly setting max_length to 20 -> no warning - with warnings.catch_warnings(record=True) as warning_list: - model.generate(input_ids, max_length=20) - self.assertEqual(len(warning_list), 0) - - # Generation config max_length != 20 -> no warning - with warnings.catch_warnings(record=True) as warning_list: - model.generation_config.max_length = 10 - model.generation_config._from_model_config = False # otherwise model.config.max_length=20 takes precedence - model.generate(input_ids) - self.assertEqual(len(warning_list), 0) diff --git a/tests/transformers/tests/models/__init__.py b/tests/transformers/tests/models/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/albert/__init__.py b/tests/transformers/tests/models/albert/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/albert/test_modeling_albert.py b/tests/transformers/tests/models/albert/test_modeling_albert.py deleted file mode 100644 index 460541d399..0000000000 --- a/tests/transformers/tests/models/albert/test_modeling_albert.py +++ /dev/null @@ -1,347 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -from transformers import AlbertConfig, is_torch_available -from transformers.models.auto import get_values -from transformers.testing_utils import require_torch, slow, torch_device - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - - from transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - AlbertForMaskedLM, - AlbertForMultipleChoice, - AlbertForPreTraining, - AlbertForQuestionAnswering, - AlbertForSequenceClassification, - AlbertForTokenClassification, - AlbertModel, - ) - - -class AlbertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - embedding_size=16, - hidden_size=36, - num_hidden_layers=2, - # this needs to be the same as `num_hidden_layers`! - num_hidden_groups=2, - num_attention_heads=6, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_hidden_groups = num_hidden_groups - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return AlbertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - num_hidden_groups=self.num_hidden_groups, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = AlbertModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = AlbertForPreTraining(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - sentence_order_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = AlbertForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = AlbertForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = AlbertForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = AlbertForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, 
token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = AlbertForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class AlbertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - AlbertModel, - AlbertForPreTraining, - AlbertForMaskedLM, - AlbertForMultipleChoice, - AlbertForSequenceClassification, - AlbertForTokenClassification, - AlbertForQuestionAnswering, - ) - if is_torch_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": AlbertModel, - "fill-mask": AlbertForMaskedLM, - "question-answering": AlbertForQuestionAnswering, - "text-classification": AlbertForSequenceClassification, - "token-classification": AlbertForTokenClassification, - "zero-shot": AlbertForSequenceClassification, - } - if is_torch_available() - else {} - ) - fx_compatible = True - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device - ) - inputs_dict["sentence_order_label"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) - return inputs_dict - - def setUp(self): - self.model_tester = AlbertModelTester(self) - self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def 
test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "albert/albert-base-v1" - model = AlbertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_torch -class AlbertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = AlbertModel.from_pretrained("albert/albert-base-v2") - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]] - ) - - self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) diff --git a/tests/transformers/tests/models/bert/__init__.py b/tests/transformers/tests/models/bert/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/bert/test_modeling_bert.py b/tests/transformers/tests/models/bert/test_modeling_bert.py deleted file mode 100644 index e77d689497..0000000000 --- a/tests/transformers/tests/models/bert/test_modeling_bert.py +++ /dev/null @@ -1,674 +0,0 @@ - -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import tempfile -import unittest - -from transformers import BertConfig, is_torch_available -from transformers.models.auto import get_values -from transformers.testing_utils import CaptureLogger, require_torch, require_torch_accelerator, slow, torch_device - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - - from transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertLMHeadModel, - BertModel, - logging, - ) - - -class BertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - """ - Returns a tiny configuration by default. 
- """ - return BertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = BertModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = BertLMHeadModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 
self.seq_length, self.vocab_size)) - - def create_and_check_model_for_causal_lm_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = BertLMHeadModel(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = BertLMHeadModel(config=config).to(torch_device).eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_next_sequence_prediction( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForNextSentencePrediction(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForPreTraining(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - 
attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BertForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BertForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = BertForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BertModel, - BertLMHeadModel, - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (BertLMHeadModel,) if 
is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BertModel, - "fill-mask": BertForMaskedLM, - "question-answering": BertForQuestionAnswering, - "text-classification": BertForSequenceClassification, - "text-generation": BertLMHeadModel, - "token-classification": BertForTokenClassification, - "zero-shot": BertForSequenceClassification, - } - if is_torch_available() - else {} - ) - fx_compatible = True - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device - ) - inputs_dict["next_sentence_label"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) - return inputs_dict - - def setUp(self): - self.model_tester = BertModelTester(self) - self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_causal_lm_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - 
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_next_sequence_prediction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_warning_if_padding_and_no_attention_mask(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.model_tester.prepare_config_and_inputs() - - # Set pad tokens in the input_ids - input_ids[0, 0] = config.pad_token_id - - # Check for warnings if the attention_mask is missing. - logger = logging.get_logger("transformers.modeling_utils") - # clear cache so we can test the warning is emitted (from `warning_once`). - logger.warning_once.cache_clear() - - with CaptureLogger(logger) as cl: - model = BertModel(config=config) - model.to(torch_device) - model.eval() - model(input_ids, attention_mask=None, token_type_ids=token_type_ids) - self.assertIn("We strongly recommend passing in an `attention_mask`", cl.out) - - @slow - def test_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - model = BertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - @require_torch_accelerator - def test_torchscript_device_change(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. 
- if model_class == BertForMultipleChoice: - return - - config.torchscript = True - model = model_class(config=config) - - inputs_dict = self._prepare_for_class(inputs_dict, model_class) - traced_model = torch.jit.trace( - model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) - ) - - with tempfile.TemporaryDirectory() as tmp: - torch.jit.save(traced_model, os.path.join(tmp, "bert.pt")) - loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device) - loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) - - -@require_torch -class BertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = BertModel.from_pretrained("google-bert/bert-base-uncased") - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]]) - - self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head_relative_embedding_key(self): - model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key") - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.0756, 0.3142, -0.5128], [0.3761, 0.3462, -0.5477], [0.2052, 0.3760, -0.1240]]] - ) - - self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - - @slow - def test_inference_no_head_relative_embedding_key_query(self): - model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key-query") - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.6496, 0.3784, 0.8203], [0.8148, 0.5656, 0.2636], [-0.0681, 0.5597, 0.7045]]] - ) - - self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - diff --git a/tests/transformers/tests/models/bridgetower/__init__.py b/tests/transformers/tests/models/bridgetower/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/transformers/tests/models/bridgetower/test_modeling_bridgetower.py deleted file mode 100644 index 58b9f62478..0000000000 --- a/tests/transformers/tests/models/bridgetower/test_modeling_bridgetower.py +++ /dev/null @@ -1,661 +0,0 @@ - -# coding=utf-8 -# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch BridgeTower model. """ - -import tempfile -import unittest - -import numpy as np - -from transformers import ( - BridgeTowerConfig, - BridgeTowerTextConfig, - BridgeTowerVisionConfig, - is_torch_available, - is_vision_available, -) -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -torch_device = "hpu" -adapt_transformers_to_gaudi() - - -if is_torch_available(): - import torch - - from transformers import ( - BridgeTowerForContrastiveLearning, - BridgeTowerForImageAndTextRetrieval, - BridgeTowerForMaskedLM, - BridgeTowerModel, - ) - -if is_vision_available(): - from PIL import Image - - from transformers import BridgeTowerProcessor - - -class BridgeTowerTextModelTester: - def __init__( - self, - parent, - hidden_act="gelu", - hidden_size=64, - initializer_factor=1, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=2, - intermediate_size=128, - tie_word_embeddings=False, - output_hidden_states=False, - ): - self.parent = parent - self.hidden_act = hidden_act - self.hidden_size = hidden_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = 99 - self.seq_length = 4 - self.batch_size = 1 - self.is_training = False - self.output_hidden_states = output_hidden_states - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_ids, attention_mask - - def get_config(self): - return BridgeTowerTextConfig( - hidden_act=self.hidden_act, - hidden_size=self.hidden_size, - initializer_factor=self.initializer_factor, - layer_norm_eps=self.layer_norm_eps, - num_attention_heads=self.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - tie_word_embeddings=self.tie_word_embeddings, - output_hidden_states=self.output_hidden_states, - vocab_size=self.vocab_size, - ) - - -class BridgeTowerImageModelTester: - def __init__( - self, - parent, - hidden_size=64, - initializer_factor=1, - layer_norm_eps=1e-05, - num_hidden_layers=2, - init_layernorm_from_vision_encoder=False, - output_hidden_states=False, - image_size=64, - ): - self.parent = parent - self.hidden_size = hidden_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.num_hidden_layers = num_hidden_layers - self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder - 
self.num_channels = 3 - self.num_image_features = 17 - self.batch_size = 1 - self.image_size = image_size - self.is_training = False - self.output_hidden_states = output_hidden_states - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = random_attention_mask([self.batch_size, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values, pixel_mask - - def get_config(self): - return BridgeTowerVisionConfig( - hidden_size=self.hidden_size, - initializer_factor=self.initializer_factor, - layer_norm_eps=self.layer_norm_eps, - num_hidden_layers=self.num_hidden_layers, - init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder, - num_channels=self.num_channels, - num_image_features=self.num_image_features, - batch_size=self.batch_size, - image_size=self.image_size, - is_training=self.is_training, - output_hidden_states=self.output_hidden_states, - ) - - -class BridgeTowerModelTester: - def __init__( - self, - parent, - text_kwargs=None, - vision_kwargs=None, - share_cross_modal_transformer_layers=True, - share_link_tower_layers=False, - link_tower_type="add", - init_layernorm_from_vision_encoder=False, - contrastive_hidden_size=512, - logit_scale_init_value=2.6592, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=128, - ): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BridgeTowerTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BridgeTowerImageModelTester(parent, **vision_kwargs) - - self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers - self.share_link_tower_layers = share_link_tower_layers - self.link_tower_type = link_tower_type - self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder - self.contrastive_hidden_size = contrastive_hidden_size - self.logit_scale_init_value = logit_scale_init_value - - self.batch_size = 1 - self.expected_num_hidden_layers = 8 - self.is_training = False - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values, pixel_mask = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return (config, input_ids, attention_mask, pixel_values, pixel_mask) - - def get_config(self): - return BridgeTowerConfig.from_text_vision_configs( - text_config=self.text_model_tester.get_config(), - vision_config=self.vision_model_tester.get_config(), - share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers, - share_link_tower_layers=self.share_link_tower_layers, - link_tower_type=self.link_tower_type, - init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder, - contrastive_hidden_size=self.contrastive_hidden_size, - logit_scale_init_value=self.logit_scale_init_value, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - model = 
BridgeTowerModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - self.parent.assertEqual( - result["text_features"].shape, - (self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.hidden_size), - ) - self.parent.assertEqual( - result["image_features"].shape, - (self.batch_size, self.vision_model_tester.num_image_features, self.vision_model_tester.hidden_size), - ) - self.parent.assertEqual( - result["pooler_output"].shape, - (self.batch_size, self.text_model_tester.hidden_size + self.vision_model_tester.hidden_size), - ) - - def create_and_check_for_image_and_text_retrieval( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - bridgetower_itm_output_last_dimension = 2 - - model = BridgeTowerForImageAndTextRetrieval(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, bridgetower_itm_output_last_dimension)) - - def create_and_check_for_masked_language_modeling( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - model = BridgeTowerForMaskedLM(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.vocab_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, attention_mask, pixel_values, pixel_mask) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "pixel_mask": pixel_mask, - } - return config, inputs_dict - - -@require_torch -class BridgeTowerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BridgeTowerModel, - BridgeTowerForImageAndTextRetrieval, - BridgeTowerForMaskedLM, - BridgeTowerForContrastiveLearning, - ) - if is_torch_available() - else () - ) - pipeline_model_mapping = {"feature-extraction": BridgeTowerModel} if is_torch_available() else {} - - is_training = False - test_headmasking = False - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - has_attentions = False - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_disk_offload(self): - pass - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_model_parallelism(self): - pass - - # function to extract meaningful tensor from output per different model_class - def extract_output(self, outputs, model_class): - return outputs["pooler_output"] if model_class == "BridgeTowerModel" else outputs["logits"] - - def setUp(self): - self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, 
config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_and_text_retrieval(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_and_text_retrieval(*config_and_inputs) - - def test_for_masked_language_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_language_modeling(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "BridgeTower/bridgetower-base" - model = BridgeTowerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_save_load_fast_init_from_base(self): - # Override as it is a slow test on this model - super().test_save_load_fast_init_from_base() - - # Override as extracting meaningful tensor from output is different for BridgeTower - def test_save_load(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**input_dict) - - out_2 = self.extract_output(outputs, model_class.__name__) - out_2 = out_2.cpu().numpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - model.to(torch_device) - with torch.no_grad(): - after_outputs = model(**input_dict) - - # Make sure we don't have nans - out_1 = self.extract_output(after_outputs, model_class.__name__) - out_1 = out_1.cpu().numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - # Override this as `hidden states output` is different for BridgeTower - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states_text, hidden_states_vision, hidden_states_cross = ( - outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - ) - - expected_num_layers = self.model_tester.expected_num_hidden_layers - self.assertEqual( - sum((len(hidden_states_text), len(hidden_states_vision), len(hidden_states_cross))), - expected_num_layers, - ) - - seq_length = self.model_tester.text_model_tester.seq_length - num_image_features = self.model_tester.vision_model_tester.num_image_features - - self.assertListEqual( - list(hidden_states_text[0].shape[-2:]), - [seq_length, self.model_tester.text_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_vision[0].shape), - [num_image_features, 1, self.model_tester.vision_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_cross[0][0].shape[-2:]), - [seq_length, self.model_tester.text_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_cross[0][1].shape[-2:]), - [num_image_features, self.model_tester.vision_model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - 
inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - check_hidden_states_output(inputs_dict, config, model_class) - - # Override as `hidden states output` is different for BridgeTower - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = self.has_attentions - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - model.to(torch_device) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0][0] - hidden_states.retain_grad() - - if self.has_attentions: - attentions = outputs.attentions[0][0] - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - - if self.has_attentions: - self.assertIsNotNone(attentions.grad) - - # override as the `logit_scale` parameter initilization is different for BRIDGE TOWER - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - config.logit_scale_init_value, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""") - def test_model_common_attributes(self): - pass - - @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. Thus this test is not applicable.""") - def test_inputs_embeds(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_torch -@require_vision -class BridgeTowerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - return ( - BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") - if is_vision_available() - else None - ) - - @slow - def test_image_and_text_retrieval(self): - model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to( - torch_device - ) - model.eval() - processor = self.default_processor - image = prepare_img() - text = "a bunch of cats laying on a tower." 
- inputs = processor(image, text, return_tensors="pt").to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = torch.Size([1, 2]) - self.assertEqual(outputs.logits.shape, expected_shape) - self.assertTrue(outputs.logits[0, 1].item() > outputs.logits[0, 0].item()) - - # verify loss - inputs["labels"] = torch.ones(1, dtype=torch.long, device=torch_device) - inputs = inputs.to(torch_device) - with torch.no_grad(): - outputs = model(**inputs) - self.assertAlmostEqual(outputs.loss.item(), 0.5108, places=4) - - @slow - def test_masked_language_modeling(self): - model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to(torch_device) - model.eval() - processor = self.default_processor - image = prepare_img() - text = "a bunch of laying on a tower." - inputs = processor(image, text, return_tensors="pt").to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = torch.Size([1, 11, 50265]) - self.assertEqual(outputs.logits.shape, expected_shape) - - # verify predicted word - predicted_id = outputs.logits.argmax(dim=-1).squeeze(0).tolist()[4] - self.assertTrue(processor.decode([predicted_id]) == " cats") - - # verify loss - inputs["labels"] = inputs["input_ids"].clone() - inputs = inputs.to(torch_device) - with torch.no_grad(): - outputs = model(**inputs) - self.assertAlmostEqual(outputs.loss.item(), 5.7373, places=4) - - @slow - def test_constrastive_learning(self): - model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc").to( - torch_device - ) - model.eval() - processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") - image = prepare_img() - text = "a bunch of cats laying on a tower." 
- inputs = processor(image, text, padding=True, return_tensors="pt").to(torch_device) - with torch.no_grad(): - outputs = model(**inputs, output_hidden_states=True, return_loss=True) - - # verify the logits - expected_shape = torch.Size([1, 3, 512]) - self.assertEqual(outputs.logits.shape, expected_shape) - - -@slow -@require_torch -class BridgeTowerModelTrainingTest(unittest.TestCase): - all_training_supported_model_classes = ( - (BridgeTowerForImageAndTextRetrieval, BridgeTowerForMaskedLM, BridgeTowerForContrastiveLearning) - if is_torch_available() - else () - ) - - def setUp(self): - self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) - - def _prepare_inputs_for_training(self, model_class): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if model_class == BridgeTowerForMaskedLM: - inputs_dict["labels"] = inputs_dict["input_ids"] - elif model_class == BridgeTowerForImageAndTextRetrieval: - inputs_dict["labels"] = ids_tensor([1], 2) - elif model_class == BridgeTowerForContrastiveLearning: - inputs_dict["return_loss"] = True - return config, inputs_dict - - def _get_non_used_layer_names(self, model_class): - non_used_layer_names = ["text_model.pooler"] - if model_class == BridgeTowerForMaskedLM: - non_used_layer_names = non_used_layer_names + [ - # This number `1` actually depends on the number of layers in `cross_modal_image_layers` (by minus 1) - "cross_modal_image_layers.1", - "cross_modal_image_pooler", - "cross_modal_text_pooler", - ] - return non_used_layer_names - - def _is_layer_used(self, model_class, layer_name): - non_used_layer_names = self._get_non_used_layer_names(model_class) - for non_used_layer_name in non_used_layer_names: - if non_used_layer_name in layer_name: - return False - return True - - def test_training(self): - for model_class in self.all_training_supported_model_classes: - config, inputs_dict = self._prepare_inputs_for_training(model_class) - model = model_class(config) - model.to(torch_device) - model.train() - - loss = model(**inputs_dict).loss - loss.backward() - - # verify the gradients of used layers' weight are not None - for name, param in model.named_parameters(): - if self._is_layer_used(model_class, name): - self.assertIsNotNone(param.grad, f"Gradients should not be None - got {param.grad} for {name}") - diff --git a/tests/transformers/tests/models/clip/__init__.py b/tests/transformers/tests/models/clip/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/clip/test_modeling_clip.py b/tests/transformers/tests/models/clip/test_modeling_clip.py deleted file mode 100644 index 3a29612124..0000000000 --- a/tests/transformers/tests/models/clip/test_modeling_clip.py +++ /dev/null @@ -1,849 +0,0 @@ - -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Testing suite for the PyTorch CLIP model. """ - - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -import transformers -from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig -from transformers.testing_utils import ( - is_flax_available, - is_pt_flax_cross_test, - require_torch, - require_vision, - slow, - torch_device, -) -from transformers.utils import is_torch_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -torch_device = "hpu" -adapt_transformers_to_gaudi() - - -if is_torch_available(): - import torch - from torch import nn - - from transformers import ( - CLIPForImageClassification, - CLIPModel, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - ) - - -if is_vision_available(): - from PIL import Image - - from transformers import CLIPProcessor - - -if is_flax_available(): - import jax.numpy as jnp - - from transformers.modeling_flax_pytorch_utils import ( - convert_pytorch_state_dict_to_flax, - load_flax_weights_in_pytorch_model, - ) - - -class CLIPVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return CLIPVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = CLIPVisionModel(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - 
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, pixel_values): - model = CLIPVisionModelWithProjection(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_torch -class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_torch_available() else () - fx_compatible = True - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = CLIPVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_common_attributes(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPVisionModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "visual_projection")) - - -class CLIPTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :start_index] = 1 - input_mask[batch_idx, start_index:] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return CLIPTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = CLIPTextModel(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, input_ids, input_mask): - model = 
CLIPTextModelWithProjection(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else () - fx_compatible = True - test_pruning = False - test_head_masking = False - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = CLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="CLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPTextModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "text_projection")) - - -class CLIPModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = 
self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return CLIPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = CLIPModel(config).to(torch_device).eval() - with torch.no_grad(): - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_torch -class CLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPModel,) if is_torch_available() else () - pipeline_model_mapping = ( - {"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_torch_available() else {} - ) - fx_compatible = True - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = CLIPModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="CLIPModel does not have input/output embeddings") - def test_model_common_attributes(self): - pass - - # override as the `logit_scale` parameter initialization is different for CLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - return - - configs_no_init = _config_zero_init(config) # To be sure we have no NaNs - configs_no_init.torchscript = True - configs_no_init.return_dict = False - for model_class in 
self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - - try: - input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] # CLIP needs pixel_values - traced_model = torch.jit.trace(model, (input_ids, pixel_values)) - except RuntimeError: - self.fail("Couldn't trace module.") - - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") - - try: - torch.jit.save(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") - - try: - loaded_model = torch.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") - - model.to(torch_device) - model.eval() - - loaded_model.to(torch_device) - loaded_model.eval() - - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() - - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] - - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } - - self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) - - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if torch.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break - - self.assertTrue(found_buffer) - model_buffers.pop(i) - - models_equal = True - for layer_name, p1 in model_state_dict.items(): - p2 = loaded_model_state_dict[layer_name] - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save CLIPConfig and check if we can load CLIPVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = CLIPVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save CLIPConfig and check if we can load CLIPTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = CLIPTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - # overwrite from common since FlaxCLIPModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_pt_to_flax(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load PyTorch class - pt_model = model_class(config).eval() - # Flax models don't use the `use_cache` option and cache is not returned as a default. - # So we disable `use_cache` here for PyTorch model. 
- pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) - fx_model.params = fx_state - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - # convert inputs to Flax - fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)} - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - pt_model.save_pretrained(tmpdirname) - fx_model_loaded = fx_model_class.from_pretrained(tmpdirname, from_pt=True) - - fx_outputs_loaded = fx_model_loaded(**fx_inputs).to_tuple() - self.assertEqual( - len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" - ) - for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) - - # overwrite from common since FlaxCLIPModel returns nested output - # which is not supported in the common test - @is_pt_flax_cross_test - def test_equivalence_flax_to_pt(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - with self.subTest(model_class.__name__): - # load corresponding PyTorch class - pt_model = model_class(config).eval() - - # So we disable `use_cache` here for PyTorch model. 
- pt_model.config.use_cache = False - - fx_model_class_name = "Flax" + model_class.__name__ - - if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return - - fx_model_class = getattr(transformers, fx_model_class_name) - - # load Flax class - fx_model = fx_model_class(config, dtype=jnp.float32) - # make sure only flax inputs are forward that actually exist in function args - fx_input_keys = inspect.signature(fx_model.__call__).parameters.keys() - - pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) - - # make sure weights are tied in PyTorch - pt_model.tie_weights() - - # prepare inputs - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - # remove function args that don't exist in Flax - pt_inputs = {k: v for k, v in pt_inputs.items() if k in fx_input_keys} - - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs).to_tuple() - - fx_inputs = {k: np.array(v.to("cpu")) for k, v in pt_inputs.items() if torch.is_tensor(v)} - - fx_outputs = fx_model(**fx_inputs).to_tuple() - self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - with tempfile.TemporaryDirectory() as tmpdirname: - fx_model.save_pretrained(tmpdirname) - pt_model_loaded = model_class.from_pretrained(tmpdirname, from_flax=True) - - with torch.no_grad(): - pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() - - self.assertEqual( - len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" - ) - for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): - self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class CLIPForImageClassificationModelTester(CLIPModelTester): - def __init__(self, parent): - super().__init__(parent) - self.batch_size = self.vision_model_tester.batch_size - self.num_hidden_layers = self.vision_model_tester.num_hidden_layers - self.hidden_size = self.vision_model_tester.hidden_size - self.seq_length = self.vision_model_tester.seq_length - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_torch -class CLIPForImageClassificationModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPForImageClassification,) if is_torch_available() else () - pipeline_model_mapping = {"image-classification": CLIPForImageClassification} if is_torch_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = CLIPForImageClassificationModelTester(self) - - @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") - def test_model_common_attributes(self): - pass - - 
@unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIP uses the same initialization scheme as the Flax original implementation") - def test_initialization(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_torch -class CLIPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPModel.from_pretrained(model_name).to(torch_device) - processor = CLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" - ).to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), - ) - self.assertEqual( - outputs.logits_per_text.shape, - torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), - ) - - expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) - - self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) - diff --git a/tests/transformers/tests/models/distilbert/__init__.py b/tests/transformers/tests/models/distilbert/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/distilbert/test_modeling_distilbert.py b/tests/transformers/tests/models/distilbert/test_modeling_distilbert.py deleted file mode 100644 index a7bdea46b4..0000000000 --- a/tests/transformers/tests/models/distilbert/test_modeling_distilbert.py +++ /dev/null @@ -1,435 +0,0 @@ - -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import tempfile -import unittest - -import pytest - -from transformers import DistilBertConfig, is_torch_available -from transformers.testing_utils import ( - require_flash_attn, - require_torch, - require_torch_accelerator, - require_torch_gpu, - slow, -) - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - - from transformers import ( - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - DistilBertForMaskedLM, - DistilBertForMultipleChoice, - DistilBertForQuestionAnswering, - DistilBertForSequenceClassification, - DistilBertForTokenClassification, - DistilBertModel, - ) - from transformers.models.distilbert.modeling_distilbert import _create_sinusoidal_embeddings - - -class DistilBertModelTester(object): - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return DistilBertConfig( - vocab_size=self.vocab_size, - dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - hidden_dim=self.intermediate_size, - hidden_act=self.hidden_act, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_distilbert_model( - self, 
config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DistilBertModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_distilbert_for_masked_lm( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DistilBertForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_distilbert_for_question_answering( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DistilBertForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_distilbert_for_sequence_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DistilBertForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_distilbert_for_token_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DistilBertForTokenClassification(config=config) - model.to(torch_device) - model.eval() - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_distilbert_for_multiple_choice( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = DistilBertForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DistilBertModel, - DistilBertForMaskedLM, - DistilBertForMultipleChoice, - DistilBertForQuestionAnswering, - DistilBertForSequenceClassification, - DistilBertForTokenClassification, - ) - if is_torch_available() - else None - ) - 
pipeline_model_mapping = ( - { - "feature-extraction": DistilBertModel, - "fill-mask": DistilBertForMaskedLM, - "question-answering": DistilBertForQuestionAnswering, - "text-classification": DistilBertForSequenceClassification, - "token-classification": DistilBertForTokenClassification, - "zero-shot": DistilBertForSequenceClassification, - } - if is_torch_available() - else {} - ) - fx_compatible = True - test_pruning = True - test_resize_embeddings = True - test_resize_position_embeddings = True - - def setUp(self): - self.model_tester = DistilBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_distilbert_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_model(*config_and_inputs) - - def test_distilbert_model_with_sinusoidal_encodings(self): - config = DistilBertConfig(sinusoidal_pos_embds=True) - model = DistilBertModel(config=config) - sinusoidal_pos_embds = torch.empty((config.max_position_embeddings, config.dim), dtype=torch.float32) - _create_sinusoidal_embeddings(config.max_position_embeddings, config.dim, sinusoidal_pos_embds) - self.model_tester.parent.assertTrue( - torch.equal(model.embeddings.position_embeddings.weight, sinusoidal_pos_embds) - ) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "distilbert-base-uncased" - model = DistilBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - @require_torch_accelerator - def test_torchscript_device_change(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. 
- if model_class == DistilBertForMultipleChoice: - return - - config.torchscript = True - model = model_class(config=config) - - inputs_dict = self._prepare_for_class(inputs_dict, model_class) - traced_model = torch.jit.trace( - model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) - ) - - with tempfile.TemporaryDirectory() as tmp: - torch.jit.save(traced_model, os.path.join(tmp, "traced_model.pt")) - loaded = torch.jit.load(os.path.join(tmp, "traced_model.pt"), map_location=torch_device) - loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) - - # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test. - @require_flash_attn - @require_torch_accelerator - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_inference_equivalence(self): - import torch - - for model_class in self.all_model_classes: - dummy_input = torch.LongTensor( - [ - [1, 2, 3, 4], - [1, 2, 8, 9], - [1, 2, 11, 12], - [1, 2, 13, 14], - ] - ).to(torch_device) - dummy_attention_mask = torch.LongTensor( - [ - [0, 1, 1, 1], - [0, 1, 1, 1], - [0, 1, 1, 1], - [0, 1, 1, 1], - ] - ).to(torch_device) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ) - model_fa.to(torch_device) - - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) - model.to(torch_device) - - logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] - logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] - - self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) - - output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) - logits_fa = output_fa.hidden_states[-1] - - output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) - logits = output.hidden_states[-1] - - self.assertTrue(torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2)) - - # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test. 
- @require_flash_attn - @require_torch_accelerator - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_inference_equivalence_right_padding(self): - import torch - - for model_class in self.all_model_classes: - dummy_input = torch.LongTensor( - [ - [1, 2, 3, 4], - [1, 2, 8, 9], - [1, 2, 11, 12], - [1, 2, 13, 14], - ] - ).to(torch_device) - dummy_attention_mask = torch.LongTensor( - [ - [0, 1, 1, 1], - [0, 1, 1, 1], - [0, 1, 1, 1], - [0, 1, 1, 1], - ] - ).to(torch_device) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ) - model_fa.to(torch_device) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.bfloat16, - ) - model.to(torch_device) - - logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] - logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] - - self.assertTrue(torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)) - - output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) - logits_fa = output_fa.hidden_states[-1] - - output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) - logits = output.hidden_states[-1] - - self.assertTrue(torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)) - - -@require_torch -class DistilBertModelIntergrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = DistilBertModel.from_pretrained("distilbert-base-uncased") - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[-0.1639, 0.3299, 0.1648], [-0.1746, 0.3289, 0.1710], [-0.1884, 0.3357, 0.1810]]] - ) - - self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) - diff --git a/tests/transformers/tests/models/falcon/__init__.py b/tests/transformers/tests/models/falcon/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/falcon/test_modeling_falcon.py b/tests/transformers/tests/models/falcon/test_modeling_falcon.py deleted file mode 100644 index ab64987629..0000000000 --- a/tests/transformers/tests/models/falcon/test_modeling_falcon.py +++ /dev/null @@ -1,696 +0,0 @@ - -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Falcon model. 
""" - - -import tempfile -import unittest - -from parameterized import parameterized - -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - FalconConfig, - is_torch_available, - set_seed, -) -from transformers.testing_utils import ( - is_flaky, - require_bitsandbytes, - require_torch, - require_torch_sdpa, - slow, - torch_device, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - - from transformers import ( - FalconForCausalLM, - FalconForQuestionAnswering, - FalconForSequenceClassification, - FalconForTokenClassification, - FalconModel, - ) - from transformers.models.falcon.modeling_falcon import ( - FalconDynamicNTKScalingRotaryEmbedding, - FalconLinearScalingRotaryEmbedding, - FalconRotaryEmbedding, - ) - - -class FalconModelTester: - def __init__( - self, - parent, - batch_size=3, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return FalconConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=1, - new_decoder_architecture=True, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = FalconModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = FalconModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = FalconForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = FalconForCausalLM(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = 
output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FalconModel, - FalconForCausalLM, - FalconForSequenceClassification, - FalconForTokenClassification, - FalconForQuestionAnswering, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (FalconForCausalLM,) if is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": FalconModel, - "question-answering": FalconForQuestionAnswering, - "text-classification": FalconForSequenceClassification, - "text-generation": FalconForCausalLM, - "token-classification": FalconForTokenClassification, - "zero-shot": FalconForSequenceClassification, - } - if is_torch_available() - else {} - ) - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = FalconModelTester(self) - self.config_tester = ConfigTester(self, config_class=FalconConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_position_embedding_types(self): - config, *inputs = self.model_tester.prepare_config_and_inputs() - for alibi in [True, False]: - config.alibi = alibi - self.model_tester.create_and_check_model(config, *inputs) - - def test_falcon_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = FalconForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_falcon_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = FalconForSequenceClassification(config) - 
model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_falcon_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(torch.float) - model = FalconForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_past_key_values_format(self): - # Falcon can have different numbers of KV-heads than the number of query heads, so we need - # to override this test to use the right head counts. - for model_class in self.all_generative_model_classes: - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - # If it doesn't support cache, pass the test - if not hasattr(config, "use_cache"): - return - - model = model_class(config).to(torch_device) - if "use_cache" not in inputs: - inputs["use_cache"] = True - outputs = model(**inputs) - - # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) - if "past_key_values" not in outputs: - return - - num_hidden_layers = ( - getattr(config, "decoder_layers", None) - or getattr(config, "num_decoder_layers", None) - or config.num_hidden_layers - ) - num_attention_heads = getattr(config, "num_kv_heads", config.num_attention_heads) - embed_dim = getattr(config, "d_model", config.hidden_size) - per_head_embed_dim = embed_dim // num_attention_heads - - past_kv = outputs["past_key_values"] - self.assertEqual(len(past_kv), num_hidden_layers) - - batch_size, seq_length = inputs["input_ids"].shape - for i in range(num_hidden_layers): - if config.new_decoder_architecture: - num_attention_heads = config.num_attention_heads - elif config.multi_query: - num_attention_heads = 1 - self.assertEqual(len(past_kv[0]), 2) # K V for the decoder = 2 - self.assertEqual( - past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - self.assertEqual( - past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - - @parameterized.expand([("linear",), ("dynamic",)]) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->Falcon - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = FalconModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = 
{"type": scaling_type, "factor": 10.0} - scaled_model = FalconModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device - - # Sanity check original RoPE - original_rope = FalconRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, short_input_length) - original_cos_long, original_sin_long = original_rope(x, long_input_length) - torch.testing.assert_close(original_cos_short, original_cos_long[:short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = FalconLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, short_input_length) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, long_input_length) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[new_position, :], original_cos_long[original_position, :]) - torch.testing.assert_close(linear_sin_long[new_position, :], original_sin_long[original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = FalconDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, short_input_length) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, long_input_length) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does not support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - # NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason).
- # for name, submodule in model_eager.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # raise ValueError("The eager model should not have SDPA attention layers") - - # has_sdpa = False - # for name, submodule in model_sdpa.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # has_sdpa = True - # break - # if not has_sdpa: - # raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - - -@require_torch -class FalconLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_falcon(self): - tokenizer = AutoTokenizer.from_pretrained("Rocketknight1/falcon-rw-1b") - model = FalconForCausalLM.from_pretrained("Rocketknight1/falcon-rw-1b") - model.eval() - model.to(torch_device) - inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) - - EXPECTED_OUTPUT = ( - "My favorite food is pizza. I love it so much that I have a pizza party every year for my birthday." - ) - - output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=19) - output_str = tokenizer.batch_decode(output_ids)[0] - - self.assertEqual(output_str, EXPECTED_OUTPUT) - - @slow - def test_lm_generation_big_models(self): - # The big models are way too big for the CI, so we use tiny random models that resemble their - # architectures but with much smaller and fewer layers - for repo in ["Rocketknight1/tiny-random-falcon-7b", "Rocketknight1/tiny-random-falcon-40b"]: - tokenizer = AutoTokenizer.from_pretrained(repo) - model = FalconForCausalLM.from_pretrained(repo) - model.eval() - model.to(torch_device) - inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) - - # We just test that these run without errors - the models are randomly initialized - # and so the actual text outputs will be garbage - model.generate(**inputs, do_sample=False, max_new_tokens=4) - model.generate(**inputs, do_sample=True, max_new_tokens=4) - model.generate(**inputs, num_beams=2, max_new_tokens=4) - - @slow - def test_lm_generation_use_cache(self): - # The big models are way too big for the CI, so we use tiny random models that resemble their - # architectures but with much smaller and fewer layers - with torch.no_grad(): - for repo in [ - "Rocketknight1/falcon-rw-1b", - "Rocketknight1/tiny-random-falcon-7b", - "Rocketknight1/tiny-random-falcon-40b", - ]: - tokenizer = AutoTokenizer.from_pretrained(repo) - model = FalconForCausalLM.from_pretrained(repo) - model.eval() - model.to(device=torch_device) - inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) - - # Test results are the same with and without cache - outputs_no_cache = model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False) - outputs_cache = model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=True) - self.assertTrue((outputs_cache - outputs_no_cache).sum().item() == 0) - - @require_bitsandbytes - @slow - def test_batched_generation(self): - tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - 
"tiiuae/falcon-7b", - device_map="auto", - load_in_4bit=True, - ) - - test_text = "A sequence: 1, 2" # should generate the rest of the sequence - - unpadded_inputs = tokenizer([test_text], return_tensors="pt").to("cuda:0") - unpadded_gen_out = model.generate(**unpadded_inputs, max_new_tokens=20) - unpadded_gen_text = tokenizer.batch_decode(unpadded_gen_out, skip_special_tokens=True) - - dummy_text = "This is a longer text " * 2 # forces left-padding on `test_text` - padded_inputs = tokenizer([test_text, dummy_text], return_tensors="pt", padding=True).to("cuda:0") - padded_gen_out = model.generate(**padded_inputs, max_new_tokens=20) - padded_gen_text = tokenizer.batch_decode(padded_gen_out, skip_special_tokens=True) - - expected_output = "A sequence: 1, 2, 3, 4, 5, 6, 7, 8, " - self.assertLess(unpadded_inputs.input_ids.shape[-1], padded_inputs.input_ids.shape[-1]) # left-padding exists - self.assertEqual(unpadded_gen_text[0], expected_output) - self.assertEqual(padded_gen_text[0], expected_output) - - @slow - @require_torch_sdpa - def test_falcon_alibi_sdpa_matches_eager(self): - input_ids = torch.randint(0, 1000, (5, 20)) - - config = FalconConfig( - vocab_size=1000, - hidden_size=64, - num_hidden_layers=3, - num_attention_heads=4, - new_decoder_architecture=True, - alibi=True, - ) - - falcon = FalconForCausalLM(config) - falcon = falcon.eval() - - with torch.no_grad(): - # output_attentions=True dispatches to eager path - falcon_output_eager = falcon(input_ids, output_attentions=True)[0] - falcon_output_sdpa = falcon(input_ids)[0] - - self.assertTrue(torch.allclose(falcon_output_eager, falcon_output_sdpa, atol=1e-3)) - diff --git a/tests/transformers/tests/models/gpt2/__init__.py b/tests/transformers/tests/models/gpt2/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py b/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py deleted file mode 100644 index fe2cbf2a80..0000000000 --- a/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py +++ /dev/null @@ -1,919 +0,0 @@ - -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import datetime -import gc -import math -import unittest - -import pytest - -from transformers import GPT2Config, is_torch_available -from transformers.testing_utils import ( - backend_empty_cache, - require_flash_attn, - require_torch, - require_torch_gpu, - slow, - torch_device, -) - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -torch_device = "hpu" - - -adapt_transformers_to_gaudi() - - -if is_torch_available(): - import torch - - from transformers import ( - GPT2DoubleHeadsModel, - GPT2ForQuestionAnswering, - GPT2ForSequenceClassification, - GPT2ForTokenClassification, - GPT2LMHeadModel, - GPT2Model, - GPT2Tokenizer, - ) - - -class GPT2ModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return GPT2Config.from_pretrained("openai-community/gpt2") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - 
choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return GPT2Config( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - n_inner=self.intermediate_size, - activation_function=self.hidden_act, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2Model(config=config) - model.to(torch_device) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2Model(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) 
- next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt2_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPT2Model(config=config) - model.to(torch_device) - model.eval() - - # create attention mask - attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - attn_mask = torch.cat( - [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt2_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPT2Model(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - 
output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2LMHeadModel(config) - model.to(torch_device) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = GPT2LMHeadModel(config) - model.to(torch_device) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def create_and_check_double_lm_head_model( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args - ): - model = GPT2DoubleHeadsModel(config) - model.to(torch_device) - model.eval() - - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - - inputs = { - "input_ids": multiple_choice_inputs_ids, - "mc_token_ids": mc_token_ids, - "attention_mask": multiple_choice_input_mask, - "token_type_ids": multiple_choice_token_type_ids, - "labels": multiple_choice_inputs_ids, - } - - result = model(**inputs) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) - ) - self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_gpt2_for_question_answering( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPT2ForQuestionAnswering(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_gpt2_for_sequence_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPT2ForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, 
token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_gpt2_for_token_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPT2ForTokenClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_gpt2_weight_initialization(self, config, *args): - model = GPT2Model(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) - for key in model.state_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_torch -class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - GPT2Model, - GPT2LMHeadModel, - GPT2DoubleHeadsModel, - # GPT2ForQuestionAnswering, - # GPT2ForSequenceClassification, - # GPT2ForTokenClassification, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPT2Model, - # "question-answering": GPT2ForQuestionAnswering, - # "text-classification": GPT2ForSequenceClassification, - "text-generation": GPT2LMHeadModel, - # "token-classification": GPT2ForTokenClassification, - # "zero-shot": GPT2ForSequenceClassification, - } - if is_torch_available() - else {} - ) - all_parallelizable_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () - fx_compatible = True - test_missing_keys = False - test_model_parallel = True - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "GPT2DoubleHeadsModel": - inputs_dict["labels"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), - dtype=torch.long, - device=torch_device, - ) - inputs_dict["input_ids"] = inputs_dict["labels"] - inputs_dict["token_type_ids"] = inputs_dict["labels"] - inputs_dict["mc_token_ids"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices), - dtype=torch.long, - device=torch_device, - ) - inputs_dict["mc_labels"] = torch.zeros( - self.model_tester.batch_size, dtype=torch.long, device=torch_device - ) - return inputs_dict - - def setUp(self): - self.model_tester = GPT2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU 
memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() - - def test_config(self): - self.config_tester.run_common_tests() - - def test_gpt2_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model(*config_and_inputs) - - def test_gpt2_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) - - def test_gpt2_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) - - def test_gpt2_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) - - def test_gpt2_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_gpt2_double_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) - - @pytest.mark.skip(reason="Question Answering Model not yet validated on Gaudi") - def test_gpt2_question_answering_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_question_answering(*config_and_inputs) - - @pytest.mark.skip(reason="Sequence Classification Model not yet validated on Gaudi") - def test_gpt2_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) - - @pytest.mark.skip(reason="Token classification Model not yet validated on Gaudi") - def test_gpt2_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_token_classification(*config_and_inputs) - - def test_gpt2_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def test_gpt2_scale_attn_by_inverse_layer_idx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt2_reorder_and_upcast_attn(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt2_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute
gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_batch_generation(self): - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - model.to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="pt", padding=True) - input_ids = inputs["input_ids"].to(torch_device) - token_type_ids = torch.cat( - [ - input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), - input_ids.new_full((input_ids.shape[0], 1), 500), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - token_type_ids=token_type_ids, - ignore_eos=True, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() - inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a mess. I'm not sure if he's going", - "Today, I'm going to be doing a lot of research on this. 
I", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_batch_generation_2heads(self): - model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2") - model.to(torch_device) - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - tokenizer.padding_side = "left" - - # This tokenizer has no pad token, so we have to set it in some way - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="pt", padding=True) - input_ids = inputs["input_ids"].to(torch_device) - token_type_ids = torch.cat( - [ - input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), - input_ids.new_full((input_ids.shape[0], 1), 500), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() - inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a mess. I'm not sure if he's going", - "Today, I'm going to be doing a lot of research on this. 
I", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - model_name = "openai-community/gpt2" - model = GPT2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_torch -class GPT2ModelLanguageGenerationTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() - - def _test_lm_generate_gpt2_helper( - self, - gradient_checkpointing=False, - reorder_and_upcast_attn=False, - scale_attn_by_inverse_layer_idx=False, - verify_outputs=True, - ): - model = GPT2LMHeadModel.from_pretrained( - "openai-community/gpt2", - reorder_and_upcast_attn=reorder_and_upcast_attn, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - ) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - else: - model.gradient_checkpointing_disable() - model.to(torch_device) - - # The dog - input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) - - # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog - expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,] # fmt: skip - output_ids = model.generate(input_ids, do_sample=False, ignore_eos=True) - if verify_outputs: - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @slow - def test_lm_generate_gpt2(self): - self._test_lm_generate_gpt2_helper() - - @slow - def test_lm_generate_gpt2_with_gradient_checkpointing(self): - self._test_lm_generate_gpt2_helper(gradient_checkpointing=True) - - @slow - def test_lm_generate_gpt2_with_reorder_and_upcast_attn(self): - self._test_lm_generate_gpt2_helper(reorder_and_upcast_attn=True) - - @slow - def test_lm_generate_gpt2_with_scale_attn_by_inverse_layer_idx(self): - self._test_lm_generate_gpt2_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False) - - @slow - def test_gpt2_sample(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - model.to(torch_device) - - torch.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) - input_ids = tokenized.input_ids.to(torch_device) - output_ids = model.generate(input_ids, do_sample=True, ignore_eos=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - token_type_ids = tokenized.token_type_ids.to(torch_device) - output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) - output_seq_tt = model.generate( - input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 - ) - output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) - - EXPECTED_OUTPUT_STR = ( - "Today is a nice day and if you don't know anything about the state of play during your holiday" - ) - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - self.assertTrue( - all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) - ) # token_type_ids should change output - - @slow - 
def test_gpt2_sample_max_time(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - model.to(torch_device) - - torch.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) - input_ids = tokenized.input_ids.to(torch_device) - - MAX_TIME = 0.5 - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=None, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - @slow - def test_contrastive_search_gpt2(self): - article = ( - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" - ) - - gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large") - gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large").to(torch_device) - input_ids = gpt2_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - outputs = gpt2_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) - - generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " - "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " - "Google Now, which helps users find the information they're looking for on the web. But the company " - "is not the only one to collect data on its users. Facebook, for example, has its own facial " - "recognition technology, as well as a database of millions of photos that it uses to personalize its " - "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " - "concerned about the company's ability to keep users' information private. 
In a blog post last " - 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' - 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' - 'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at ' - 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' - "but said in a statement to The Associated Press that" - ], - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_left(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GPT2LMHeadModel.from_pretrained("gpt2", torch_dtype=torch.float16).to(0) - - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "left" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GPT2LMHeadModel.from_pretrained( - "gpt2", device_map={"": 0}, attn_implementation="flash_attention_2", torch_dtype=torch.float16 - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - expected_output = [ - "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>hi, who was born in the city of Kolkata, was a member of the Kolkata", - "Hello this is a very long sentence. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry", - ] - - self.assertListEqual(output_native, output_fa_2) - self.assertListEqual(output_native, expected_output) - diff --git a/tests/transformers/tests/models/gpt_neox/__init__.py b/tests/transformers/tests/models/gpt_neox/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py deleted file mode 100644 index 07512c3760..0000000000 --- a/tests/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ /dev/null @@ -1,442 +0,0 @@ - -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch GPTNeoX model. 
""" - - -import unittest - -from parameterized import parameterized - -from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed -from transformers.testing_utils import require_torch, slow, torch_device - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -torch_device = "hpu" - - -adapt_transformers_to_gaudi() - - -if is_torch_available(): - import torch - - from transformers import ( - GPTNeoXForCausalLM, - GPTNeoXForQuestionAnswering, - GPTNeoXForSequenceClassification, - GPTNeoXForTokenClassification, - GPTNeoXModel, - ) - from transformers.models.gpt_neox.modeling_gpt_neox import ( - GPTNeoXDynamicNTKScalingRotaryEmbedding, - GPTNeoXLinearScalingRotaryEmbedding, - GPTNeoXRotaryEmbedding, - ) - - -class GPTNeoXModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.pad_token_id = vocab_size - 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_labels = None - if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return config, input_ids, input_mask, token_labels - - def get_config(self): - return GPTNeoXConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_decoder(self): - config, input_ids, input_mask, 
token_labels = self.prepare_config_and_inputs() - - config.is_decoder = True - - return config, input_ids, input_mask, token_labels - - def create_and_check_model(self, config, input_ids, input_mask): - model = GPTNeoXModel(config=config) - model.to(torch_device) - model.eval() - _ = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder(self, config, input_ids, input_mask): - config.add_cross_attention = True - model = GPTNeoXModel(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_labels): - model = GPTNeoXForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering(self, config, input_ids, input_mask, token_labels): - config.num_labels = self.num_labels - model = GPTNeoXForQuestionAnswering(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification(self, config, input_ids, input_mask, token_labels): - config.num_labels = self.num_labels - model = GPTNeoXForSequenceClassification(config) - model.to(torch_device) - model.eval() - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification(self, config, input_ids, input_mask, token_labels): - config.num_labels = self.num_labels - model = GPTNeoXForTokenClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, input_mask): - config.is_decoder = True - model = GPTNeoXForCausalLM(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model(input_ids, attention_mask=input_mask, use_cache=True) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True) - output_from_no_past = output_from_no_past["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - 
)["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask, token_labels = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - GPTNeoXModel, - GPTNeoXForCausalLM, - GPTNeoXForQuestionAnswering, - GPTNeoXForSequenceClassification, - GPTNeoXForTokenClassification, - ) - if is_torch_available() - else () - ) - all_generative_model_classes = (GPTNeoXForCausalLM,) if is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTNeoXModel, - "question-answering": GPTNeoXForQuestionAnswering, - "text-classification": GPTNeoXForSequenceClassification, - "text-generation": GPTNeoXForCausalLM, - "token-classification": GPTNeoXForTokenClassification, - "zero-shot": GPTNeoXForSequenceClassification, - } - if is_torch_available() - else {} - ) - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - def setUp(self): - self.model_tester = GPTNeoXModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTNeoXConfig, hidden_size=64, num_attention_heads=8) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(config, input_ids, input_mask) - - def test_model_as_decoder(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) - - def test_decoder_model_past_large_inputs(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(config, input_ids, input_mask) - - def test_model_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_model_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_model_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_model_for_token_classification(self): - 
config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @parameterized.expand([("linear",), ("dynamic",)]) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->GPTNeoX - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42)  # Fixed seed at init time so the two models get the same random weights - original_model = GPTNeoXModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42)  # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GPTNeoXModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - # Copied from tests.models.falcon.test_modeling_falcon.FalconModelTest.test_model_rope_scaling with Falcon->GPTNeoX, rope_theta->rotary_emb_base - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn(1, dtype=torch.float32, device=torch_device)  # used exclusively to get the dtype and the device - - # Sanity check original RoPE - original_rope = GPTNeoXRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rotary_emb_base, - ).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, short_input_length) - original_cos_long, original_sin_long = original_rope(x, long_input_length) - torch.testing.assert_close(original_cos_short, original_cos_long[:short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = GPTNeoXLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rotary_emb_base, - scaling_factor=scaling_factor, - ).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, short_input_length) -
linear_cos_long, linear_sin_long = linear_scaling_rope(x, long_input_length) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[new_position, :], original_cos_long[original_position, :]) - torch.testing.assert_close(linear_sin_long[new_position, :], original_sin_long[original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = GPTNeoXDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rotary_emb_base, - scaling_factor=scaling_factor, - ).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, short_input_length) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, long_input_length) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - -@require_torch -class GPTNeoXLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_gptneox(self): - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped") - for checkpointing in [True, False]: - model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-deduped") - - if checkpointing: - model.gradient_checkpointing_enable() - else: - model.gradient_checkpointing_disable() - model.to(torch_device) - - inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) - # The hub repo. is updated on 2023-04-04, resulting in poor outputs. 
- # See: https://github.com/huggingface/transformers/pull/24193 - expected_output = "My favorite food is a good old-fashioned, old-fashioned, old-fashioned.\n\nI'm not sure" - - output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20) - output_str = tokenizer.batch_decode(output_ids)[0] - - self.assertEqual(output_str, expected_output) - - def pythia_integration_test(self): - model_name_or_path = "EleutherAI/pythia-70m" - model = GPTNeoXForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16).to(torch_device) - EXPECTED_LOGITS = torch.tensor([1069.0000, 228.7500, 1072.0000, 1072.0000, 1069.0000, 1068.0000, 1068.0000, 1071.0000, 1071.0000, 1071.0000, 1073.0000, 1070.0000, 1071.0000, 1075.0000, 1073.0000, 1075.0000, 1074.0000, 1069.0000, 1072.0000, 1071.0000, 1071.0000, 1071.0000, 1070.0000, 1069.0000, 1069.0000, 1069.0000, 1070.0000, 1075.0000, 1073.0000, 1074.0000]) # fmt: skip - input_ids = [29, 93, 303, 64, 5478, 49651, 10394, 187, 34, 12939, 875] - # alternative: tokenizer('<|im_start|>system\nA chat between') - input_ids = torch.as_tensor(input_ids)[None].to(torch_device) - outputs = model(input_ids)["logits"][:, -1][0, :30] - self.assertTrue(torch.allclose(EXPECTED_LOGITS, outputs, atol=1e-5)) - diff --git a/tests/transformers/tests/models/gptj/__init__.py b/tests/transformers/tests/models/gptj/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/gptj/test_modeling_gptj.py b/tests/transformers/tests/models/gptj/test_modeling_gptj.py deleted file mode 100644 index f221002b42..0000000000 --- a/tests/transformers/tests/models/gptj/test_modeling_gptj.py +++ /dev/null @@ -1,703 +0,0 @@ - -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import datetime -import unittest - -import pytest - -from transformers import BitsAndBytesConfig, GPTJConfig, is_torch_available -from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, - require_torch, - require_torch_gpu, - slow, - tooslow, - torch_device, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - - from transformers import ( - AutoTokenizer, - GPTJForCausalLM, - GPTJForQuestionAnswering, - GPTJForSequenceClassification, - GPTJModel, - ) - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False - - -class GPTJModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - rotary_dim=4, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.rotary_dim = rotary_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = 
self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return GPTJConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - rotary_dim=self.rotary_dim, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_gptj_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJModel(config=config) - model.to(torch_device) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gptj_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJModel(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, 
random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gptj_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTJModel(config=config) - model.to(torch_device) - model.eval() - - # create attention mask - attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - attn_mask = torch.cat( - [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gptj_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTJModel(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, 
:, random_slice_idx].detach() - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJForCausalLM(config) - model.to(torch_device) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = GPTJForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - model.to(torch_device) - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} - - return config, inputs_dict - - -@require_torch -class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (GPTJModel, GPTJForCausalLM, GPTJForSequenceClassification, GPTJForQuestionAnswering) - if is_torch_available() - else () - ) - all_generative_model_classes = (GPTJForCausalLM,) if is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTJModel, - "question-answering": GPTJForQuestionAnswering, - "text-classification": GPTJForSequenceClassification, - "text-generation": GPTJForCausalLM, - "zero-shot": GPTJForSequenceClassification, - } - if is_torch_available() - else {} - ) - fx_compatible = True - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - @unittest.skipIf( - not is_torch_greater_or_equal_than_1_12, reason="PR #22069 made changes that require torch v1.12+." - ) - @pytest.mark.skip("Skipped for Gaudi") - def test_torch_fx(self): - super().test_torch_fx() - - @unittest.skipIf( - not is_torch_greater_or_equal_than_1_12, reason="PR #22069 made changes that require torch v1.12+." - ) - @pytest.mark.skip("Skipped for Gaudi") - def test_torch_fx_output_loss(self): - super().test_torch_fx_output_loss() - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. 
- # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - return inputs_dict - - def setUp(self): - self.model_tester = GPTJModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTJConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_gptj_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model(*config_and_inputs) - - def test_gptj_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_past(*config_and_inputs) - - def test_gptj_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_attention_mask_past(*config_and_inputs) - - def test_gptj_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_past_large_inputs(*config_and_inputs) - - def test_gptj_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_gptj_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - @tooslow - def test_batch_generation(self): - # Marked as @tooslow due to GPU OOM - model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16) - model.to(torch_device) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="pt", padding=True) - input_ids = inputs["input_ids"].to(torch_device) - token_type_ids = torch.cat( - [ - input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), - input_ids.new_full((input_ids.shape[0], 1), 500), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() - inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = 
tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little over a year old and has been diagnosed with a heart murmur", - "Today, I’m going to talk about the most important thing in the", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - model_name = "EleutherAI/gpt-j-6B" - model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) - self.assertIsNotNone(model) - - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - - texts = ["hi", "Hello this is a very long sentence"] - expected_outputs = [ - "hi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Q: I have a question about the new version of the game. I have a question about the", - "Hello this is a very long sentence.\n\nA:\n\nI think the best way to understand this is to think of it", - ] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True) - - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - revision="float16", - torch_dtype=torch.float16, - quantization_config=quantization_config, - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(expected_outputs, output_fa_2) - - -@require_torch -class GPTJModelLanguageGenerationTest(unittest.TestCase): - @tooslow - def test_lm_generate_gptj(self): - # Marked as @tooslow due to GPU OOM - for checkpointing in [True, False]: - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16 - ) - if checkpointing: - model.gradient_checkpointing_enable() - else: - model.gradient_checkpointing_disable() - model.to(torch_device) - input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog - # The dog is a man's best friend. 
It is a loyal companion, and it is a friend - expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545] # fmt: skip - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @tooslow - def test_gptj_sample(self): - # Marked as @tooslow due to GPU OOM (issue #13676) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") - model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16) - model.to(torch_device) - - torch.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) - input_ids = tokenized.input_ids.to(torch_device) - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - token_type_ids = tokenized.token_type_ids.to(torch_device) - output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) - output_seq_tt = model.generate( - input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 - ) - output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) - - if torch_device != "cpu": - # currently this expect value is only for `cuda` - EXPECTED_OUTPUT_STR = ( - "Today is a nice day and I've already been enjoying it. I walked to work with my wife" - ) - else: - EXPECTED_OUTPUT_STR = "Today is a nice day and one of those days that feels a bit more alive. I am ready" - - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - self.assertTrue( - all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) - ) # token_type_ids should change output - - @slow - def test_gptj_sample_max_time(self): - tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random") - model = GPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random") - model.to(torch_device) - - torch.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) - input_ids = tokenized.input_ids.to(torch_device) - - MAX_TIME = 0.5 - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * 
MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=None, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - @tooslow - def test_contrastive_search_gptj(self): - article = ( - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and " - "research laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" - ) - - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16 - ).to(torch_device) - input_ids = tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - - outputs = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " - "United Kingdom with offices in Mountain View, San Francisco, New York City, Paris, Tokyo, Seoul, " - "Beijing, Singapore, Tel Aviv, Dublin, Sydney, and Melbourne.[1]\n\nContents\n\nIn 2010, Google's " - "parent company, Alphabet, announced a $500 million investment in DeepMind, with the aim of creating " - "a company that would apply deep learning to problems in healthcare, energy, transportation, and " - "other areas.[2]\n\nOn April 23, 2014, Google announced that it had acquired DeepMind for $400 " - "million in cash and stock.[3] The acquisition was seen as a way for Google to enter the " - "fast-growing field of artificial intelligence (AI), which it had so far avoided due to concerns " - 'about ethical and social implications.[4] Google co-founder Sergey Brin said that he was "thrilled" ' - 'to have acquired DeepMind, and that it would "help us push the boundaries of AI even further."' - "[5]\n\nDeepMind's founders, Demis Hassabis and Mustafa Suleyman, were joined by a number of Google " - "employees" - ], - ) - diff --git a/tests/transformers/tests/models/llama/__init__.py b/tests/transformers/tests/models/llama/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/tests/models/llama/test_modeling_llama.py b/tests/transformers/tests/models/llama/test_modeling_llama.py deleted file mode 100644 index 4ab2190ad5..0000000000 --- a/tests/transformers/tests/models/llama/test_modeling_llama.py +++ /dev/null @@ -1,665 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch LLaMA model.""" - -import tempfile -import unittest - -import pytest -from parameterized import parameterized -from transformers import is_torch_available, set_seed -from optimum.habana.transformers.models.llama.configuration_llama import LlamaConfig -from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, - require_torch, - require_torch_accelerator, - require_torch_gpu, - require_torch_sdpa, - slow, -) - -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi -from optimum.habana.utils import set_seed - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -torch_device = "hpu" -adapt_transformers_to_gaudi() - -if is_torch_available(): - import torch - from transformers import ( - CodeLlamaTokenizer, - LlamaForCausalLM, - LlamaForSequenceClassification, - LlamaModel, - LlamaTokenizer, - ) - - -class LlamaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return LlamaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - 
intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LlamaModel(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = LlamaModel(config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = LlamaForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = LlamaForCausalLM(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - 
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (LlamaModel, LlamaForCausalLM, LlamaForSequenceClassification) if is_torch_available() else () - all_generative_model_classes = (LlamaForCausalLM,) if is_torch_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": LlamaModel, - "text-classification": LlamaForSequenceClassification, - "text-generation": LlamaForCausalLM, - "zero-shot": LlamaForSequenceClassification, - } - if is_torch_available() - else {} - ) - test_headmasking = False - test_pruning = False - fx_compatible = True - - def setUp(self): - self.model_tester = LlamaModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlamaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_llama_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = LlamaForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_llama_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = LlamaForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_llama_sequence_classification_model_for_multi_label(self): - config, input_dict = 
self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1).to(torch_device) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(torch.float) - model = LlamaForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip("Llama buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = LlamaModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = LlamaModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - - @require_flash_attn - @require_torch_gpu - @slow - def test_use_flash_attention_2_true(self): - """ - NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - model.save_pretrained(tmp_dir) - - new_model = LlamaForCausalLM.from_pretrained( - tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16 - ).to("cuda") - - self.assertTrue(new_model.config._attn_implementation == "flash_attention_2") - - has_flash = False - for name, submodule in new_model.named_modules(): - if "FlashAttention" in submodule.__class__.__name__: - has_flash = True - break - if not has_flash: - raise ValueError("The flash model should have flash attention layers") - - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = LlamaTokenizer.from_pretrained("saibo/llama-1B") - - model_sdpa = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - 
texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - - -@require_torch -class LlamaIntegrationTest(unittest.TestCase): - @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!") - @slow - def test_model_7b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto") - out = model(torch.tensor([input_ids])) - # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-6.6550, -4.1227, -4.9859, -3.2406, 0.8262, -3.0033, 1.2964, -3.3699]]) - torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = torch.tensor([-12.8281, -7.4453, -0.4639, -8.0625, -7.2500, -8.0000, -6.4883, -7.7695, -7.8438, -7.0312, -6.2188, -7.1328, -1.8496, 1.9961, -8.6250, -6.7227, -12.8281, -6.9492, -7.0742, -7.7852, -7.5820, -7.9062, -6.9375, -7.9805, -8.3438, -8.1562, -8.0469, -7.6250, -7.7422, -7.3398,]) # fmt: skip - torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5) - - @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!") - @slow - def test_model_13b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf", device_map="auto") - out = model(torch.tensor(input_ids)) - # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-2.0622, -1.2794, -1.1638, -0.9788, -1.4603, -1.0238, -1.7893, -1.4411]]) - torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = torch.tensor([-8.1406, -8.0547, 2.7461, -1.2344, -0.1448, -1.8262, -1.0020, -1.8154, -1.6895, -1.8516, -2.3574, -0.9277, 3.7598, 6.5742, -1.2998, -0.1177, -8.1406, -2.9688, -2.9199, -3.1699, -3.5254, -2.3555, -2.7988, -3.4141, -2.8262, -4.5195, -3.3379, -3.3164, -2.7832, -3.0273]) # fmt: skip - torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5) - - @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!") - @slow - def test_model_13bf_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", device_map="auto") - out = model(torch.tensor(input_ids)) - # Expected mean on dim = -1 - EXPECTED_MEAN = torch.tensor([[-0.8562, -1.8520, -0.7551, -0.4162, -1.5161, -1.2038, -2.4823, -2.3254]]) - torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = torch.tensor([-2.2227, 4.8828, 0.9023, -0.4578, -0.7871, -0.1033, -0.6221, -0.5786, -0.7803, -1.0674, -1.2920, -0.1570, 0.8008, 2.0723, -0.9497, 0.2771, -2.2227, -0.7612, -1.4346, -1.2061, -1.6426, -0.3000, -0.7139, -1.1934, -1.8691, -1.6973, -1.5947, -1.2705, -0.3523, -0.5513]) # fmt: skip - 
torch.testing.assert_close(out.mean(-1), EXPECTED_SLICE, atol=1e-2, rtol=1e-2) - - @unittest.skip( - "Logits are not exactly the same, once we fix the instabalities somehow, will update! Also it is gonna be a `too_slow` test" - ) - @slow - def test_model_70b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-70b-hf", device_map="auto") - out = model(torch.tensor(input_ids)) - - EXPECTED_MEAN = torch.tensor( - [[-4.2327, -3.3360, -4.6665, -4.7631, -1.8180, -3.4170, -1.4211, -3.1810]], dtype=torch.float32 - ) - torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - EXPECTED_SLICE = torch.tensor([-9.4922, -3.9551, 1.7998, -5.6758, -5.1055, -5.8984, -4.8320, -6.8086, -6.5391, -5.6172, -5.5820, -5.5352, 1.7881, 3.6289, -6.5117, -3.4785, -9.5000, -6.0352, -6.8125, -6.0195, -6.6836, -5.4727, -6.2812, -6.0391, -7.3398, -7.4297, -7.4844, -6.5820, -5.8789, -5.5312]) # fmt: skip - torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5) - - @unittest.skip("Model is curently gated") - @slow - def test_model_13b_greedy_generation(self): - EXPECTED_TEXT_COMPLETION = """Simply put, the theory of relativity states that 1) the laws of physics are the same everywhere in the universe and 2) the passage of time and the length of objects can vary depending on the observer\'s frame of reference.\n\nThe first part of the theory, that the laws of physics are the same everywhere, is known as the "princi""" - prompt = "Simply put, the theory of relativity states that " - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf") - input_ids = tokenizer.encode(prompt, return_tensors="pt") - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-13b-chat-hf", device_map="sequential", use_safetensors=False - ) - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=64, top_p=None, temperature=1, do_sample=False) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - -@require_torch -class CodeLlamaIntegrationTest(unittest.TestCase): - PROMPTS = [ - '''def remove_non_ascii(s: str) -> str: - """ - return result -''', - """# Installation instructions: - ```bash - - ``` -This downloads the LLaMA inference code and installs the repository as a local pip package. -""", - """class InterfaceManagerFactory(AbstractManagerFactory): - def __init__( -def main(): - factory = InterfaceManagerFactory(start=datetime.now()) - managers = [] - for i in range(10): - managers.append(factory.build(id=i)) -""", - """/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/ -theorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) : -π₁ P = 0 ↔ = 0 := -begin -split, -{ intros h f, - rw pi_1_etalisation at h, - simp [h], - refl -}, -{ intro h, - have := @quasi_adjoint C D P, - simp [←pi_1_etalisation, this, h], - refl -} -end -""", - ] - - @require_torch_accelerator - @slow - def test_model_7b_logits(self): - model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf").to(torch_device) - tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf") - # Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. 
-        # meaning by default this supports passing split lists of inputs
-        processed_text = tokenizer.batch_decode(tokenizer(self.PROMPTS)["input_ids"], add_special_tokens=False)
-        # fmt: off
-        EXPECTED_TEXT = [
-            ' def remove_non_ascii(s: str) -> str:\n    """  \n    return result\n ',
-            ' # Installation instructions:\n    ```bash\n \n    ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n ',
-            ' class InterfaceManagerFactory(AbstractManagerFactory):\n    def __init__( \ndef main():\n    factory = InterfaceManagerFactory(start=datetime.now())\n    managers = []\n    for i in range(10):\n        managers.append(factory.build(id=i))\n ',
-            ' /-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\nπ₁ P = 0 ↔   = 0 :=\nbegin\nsplit,\n{ intros h f,\n    rw pi_1_etalisation at h,\n    simp [h],\n    refl\n},\n{ intro h,\n    have := @quasi_adjoint C D P,\n    simp [←pi_1_etalisation, this, h],\n    refl\n}\nend\n '
    -        ]
    -        # fmt: on
    -        self.assertEqual(processed_text, EXPECTED_TEXT)
    -        processed_text_suffix_first = tokenizer.batch_decode(
    -            tokenizer(self.PROMPTS, suffix_first=True, add_special_tokens=False)["input_ids"]
    -        )
    -
    -        # fmt: off
    -        EXPECTED_TEXT = [
-            ' \n    return result\n  def remove_non_ascii(s: str) -> str:\n    """ ',
-            ' \n    ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n  # Installation instructions:\n    ```bash\n',
-            ' \ndef main():\n    factory = InterfaceManagerFactory(start=datetime.now())\n    managers = []\n    for i in range(10):\n        managers.append(factory.build(id=i))\n  class InterfaceManagerFactory(AbstractManagerFactory):\n    def __init__(',
-            '  = 0 :=\nbegin\nsplit,\n{ intros h f,\n    rw pi_1_etalisation at h,\n    simp [h],\n    refl\n},\n{ intro h,\n    have := @quasi_adjoint C D P,\n    simp [←pi_1_etalisation, this, h],\n    refl\n}\nend\n  /-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\nπ₁ P = 0 ↔ '
    -        ]
    -        EXPECTED_IDS = torch.tensor([[    1, 32007, 822, 3349, 29918, 5464, 29918, 294, 18869, 29898,29879, 29901, 851, 29897, 1599, 851, 29901, 13, 1678, 9995, 29871, 32008, 13, 1678, 736, 1121, 13, 32009, 15941, 1661, 29899, 28599, 2687, 4890, 515, 263, 1347, 29889, 13, 13, 1678, 826, 3174, 29901, 13, 4706, 269, 29901, 450, 1347, 304, 3349, 1661, 29899, 28599, 2687, 4890, 515, 29889, 13, 13, 1678, 16969, 29901, 13, 4706, 450, 1347, 411, 1661, 29899, 28599, 2687, 4890, 6206, 29889, 13, 1678, 9995, 13, 1678, 1121, 353, 5124, 13, 1678, 363, 274, 297, 269, 29901, 13, 4706, 565, 4356, 29898, 29883, 29897, 529, 29871, 29896, 29906, 29947, 29901, 13, 9651, 1121, 4619, 274, 32010, 2]])
    -        # fmt: on
    -        self.assertEqual(processed_text_suffix_first, EXPECTED_TEXT)
    -        input_ids = tokenizer(self.PROMPTS[0], return_tensors="pt")["input_ids"]
    -        generated_ids = model.generate(input_ids.to(torch_device), max_new_tokens=128)
    -        torch.testing.assert_close(generated_ids, EXPECTED_IDS)
    -
    -        EXPECTED_INFILLING = [
-            ' def remove_non_ascii(s: str) -> str:\n    """  \n    return result\n Remove non-ASCII characters from a string.\n\n    Args:\n        s: The string to remove non-ASCII characters from.\n\n    Returns:\n        The string with non-ASCII characters removed.\n    """\n    result = ""\n    for c in s:\n        if ord(c) < 128:\n            result += c '
    -        ]
    -        infilling = tokenizer.batch_decode(generated_ids)
    -        self.assertEqual(infilling, EXPECTED_INFILLING)
    diff --git a/tests/transformers/tests/models/mistral/__init__.py b/tests/transformers/tests/models/mistral/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/mistral/test_modeling_mistral.py b/tests/transformers/tests/models/mistral/test_modeling_mistral.py
    deleted file mode 100644
    index 6014175668..0000000000
    --- a/tests/transformers/tests/models/mistral/test_modeling_mistral.py
    +++ /dev/null
    @@ -1,634 +0,0 @@
    -
    -# coding=utf-8
    -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -""" Testing suite for the PyTorch Mistral model. """
    -
    -
    -import gc
    -import tempfile
    -import unittest
    -
    -import pytest
    -
    -from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed
    -from transformers.testing_utils import (
    -    backend_empty_cache,
    -    is_flaky,
    -    require_bitsandbytes,
    -    require_flash_attn,
    -    require_torch,
    -    require_torch_gpu,
    -    require_torch_sdpa,
    -    slow,
    -    torch_device,
    -)
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...generation.test_utils import GenerationTesterMixin
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, ids_tensor
    -
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -if is_torch_available():
    -    import torch
    -
    -    from transformers import (
    -        MistralForCausalLM,
    -        MistralForSequenceClassification,
    -        MistralModel,
    -    )
    -
    -
    -class MistralModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        seq_length=7,
    -        is_training=True,
    -        use_input_mask=True,
    -        use_token_type_ids=False,
    -        use_labels=True,
    -        vocab_size=99,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        num_key_value_heads=2,
    -        intermediate_size=37,
    -        hidden_act="gelu",
    -        hidden_dropout_prob=0.1,
    -        attention_probs_dropout_prob=0.1,
    -        max_position_embeddings=512,
    -        type_vocab_size=16,
    -        type_sequence_label_size=2,
    -        initializer_range=0.02,
    -        num_labels=3,
    -        num_choices=4,
    -        pad_token_id=0,
    -        scope=None,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.seq_length = seq_length
    -        self.is_training = is_training
    -        self.use_input_mask = use_input_mask
    -        self.use_token_type_ids = use_token_type_ids
    -        self.use_labels = use_labels
    -        self.vocab_size = vocab_size
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.num_key_value_heads = num_key_value_heads
    -        self.intermediate_size = intermediate_size
    -        self.hidden_act = hidden_act
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.attention_probs_dropout_prob = attention_probs_dropout_prob
    -        self.max_position_embeddings = max_position_embeddings
    -        self.type_vocab_size = type_vocab_size
    -        self.type_sequence_label_size = type_sequence_label_size
    -        self.initializer_range = initializer_range
    -        self.num_labels = num_labels
    -        self.num_choices = num_choices
    -        self.pad_token_id = pad_token_id
    -        self.scope = scope
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs
    -    def prepare_config_and_inputs(self):
    -        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
    -
    -        input_mask = None
    -        if self.use_input_mask:
    -            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
    -
    -        token_type_ids = None
    -        if self.use_token_type_ids:
    -            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
    -
    -        sequence_labels = None
    -        token_labels = None
    -        choice_labels = None
    -        if self.use_labels:
    -            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    -            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
    -            choice_labels = ids_tensor([self.batch_size], self.num_choices)
    -
    -        config = self.get_config()
    -
    -        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -
    -    def get_config(self):
    -        return MistralConfig(
    -            vocab_size=self.vocab_size,
    -            hidden_size=self.hidden_size,
    -            num_hidden_layers=self.num_hidden_layers,
    -            num_attention_heads=self.num_attention_heads,
    -            num_key_value_heads=self.num_key_value_heads,
    -            intermediate_size=self.intermediate_size,
    -            hidden_act=self.hidden_act,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
    -            max_position_embeddings=self.max_position_embeddings,
    -            type_vocab_size=self.type_vocab_size,
    -            is_decoder=False,
    -            initializer_range=self.initializer_range,
    -            pad_token_id=self.pad_token_id,
    -        )
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mistral
    -    def create_and_check_model(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        model = MistralModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask)
    -        result = model(input_ids)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mistral
    -    def create_and_check_model_as_decoder(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.add_cross_attention = True
    -        model = MistralModel(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -        )
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -        )
    -        result = model(input_ids, attention_mask=input_mask)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mistral
    -    def create_and_check_for_causal_lm(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        model = MistralForCausalLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mistral
    -    def create_and_check_decoder_model_past_large_inputs(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.is_decoder = True
    -        config.add_cross_attention = True
    -        model = MistralForCausalLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        # first forward pass
    -        outputs = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            use_cache=True,
    -        )
    -        past_key_values = outputs.past_key_values
    -
-        # create hypothetical multiple next tokens and extend to next_input_ids
    -        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    -        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
    -
-        # append to next input_ids and next attention mask
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
    -
    -        output_from_no_past = model(
    -            next_input_ids,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -        output_from_past = model(
    -            next_tokens,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            past_key_values=past_key_values,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
    -
    -        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -        ) = config_and_inputs
    -        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (MistralModel, MistralForCausalLM, MistralForSequenceClassification) if is_torch_available() else ()
    -    )
    -    all_generative_model_classes = (MistralForCausalLM,) if is_torch_available() else ()
    -    pipeline_model_mapping = (
    -        {
    -            "feature-extraction": MistralModel,
    -            "text-classification": MistralForSequenceClassification,
    -            "text-generation": MistralForCausalLM,
    -            "zero-shot": MistralForSequenceClassification,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    test_headmasking = False
    -    test_pruning = False
    -
    -    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
    -    def is_pipeline_test_to_skip(
    -        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    -    ):
    -        return True
    -
    -    # TODO: @Fxmarty
    -    @is_flaky(max_attempts=3, description="flaky on some models.")
    -    @require_torch_sdpa
    -    @slow
    -    def test_eager_matches_sdpa_generate(self):
    -        super().test_eager_matches_sdpa_generate()
    -
    -    def setUp(self):
    -        self.model_tester = MistralModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_various_embeddings(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        for type in ["absolute", "relative_key", "relative_key_query"]:
    -            config_and_inputs[0].position_embedding_type = type
    -            self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_Mistral_sequence_classification_model(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        print(config)
    -        config.num_labels = 3
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
    -        model = MistralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    def test_Mistral_sequence_classification_model_for_single_label(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.num_labels = 3
    -        config.problem_type = "single_label_classification"
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
    -        model = MistralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    def test_Mistral_sequence_classification_model_for_multi_label(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.num_labels = 3
    -        config.problem_type = "multi_label_classification"
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor(
    -            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
    -        ).to(torch.float)
    -        model = MistralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    @unittest.skip("Mistral buffers include complex numbers, which breaks this test")
    -    def test_save_load_fast_init_from_base(self):
    -        pass
    -
-    @unittest.skip("Mistral uses GQA on all models so the KV cache is a non-standard format")
    -    def test_past_key_values_format(self):
    -        pass
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_generate_padding_right(self):
    -        import torch
    -
    -        for model_class in self.all_generative_model_classes:
    -            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -            model = model_class(config)
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
    -                    torch_device
    -                )
    -
    -                dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
    -                dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device)
    -
    -                model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False)
    -
    -                model = model_class.from_pretrained(
    -                    tmpdirname,
    -                    torch_dtype=torch.float16,
    -                    attn_implementation="flash_attention_2",
    -                    low_cpu_mem_usage=True,
    -                ).to(torch_device)
    -
    -                with self.assertRaises(ValueError):
    -                    _ = model.generate(
    -                        dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
    -                    )
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_generate_use_cache(self):
    -        import torch
    -
    -        max_new_tokens = 30
    -
    -        for model_class in self.all_generative_model_classes:
    -            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            dummy_input = inputs_dict[model_class.main_input_name]
    -            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
    -                dummy_input = dummy_input.to(torch.float16)
    -
    -            # make sure that all models have enough positions for generation
    -            if hasattr(config, "max_position_embeddings"):
    -                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
    -
    -            model = model_class(config)
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -
    -                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
    -                # NOTE: Mistral apparently does not support right padding + use_cache with FA2.
    -                dummy_attention_mask[:, -1] = 1
    -
    -                model = model_class.from_pretrained(
    -                    tmpdirname,
    -                    torch_dtype=torch.float16,
    -                    attn_implementation="flash_attention_2",
    -                    low_cpu_mem_usage=True,
    -                ).to(torch_device)
    -
    -                # Just test that a large cache works as expected
    -                _ = model.generate(
    -                    dummy_input,
    -                    attention_mask=dummy_attention_mask,
    -                    max_new_tokens=max_new_tokens,
    -                    do_sample=False,
    -                    use_cache=True,
    -                )
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_inference_equivalence_right_padding(self):
    -        self.skipTest("Mistral flash attention does not support right padding")
    -
    -
    -@require_torch_gpu
    -class MistralIntegrationTest(unittest.TestCase):
    -    # This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
    -    # Depending on the hardware, we get different logits / generations.
    -    cuda_compute_capability_major_version = None
    -
    -    @classmethod
    -    def setUpClass(cls):
    -        if is_torch_available() and torch.cuda.is_available():
    -            # 8 is for A100 / A10 and 7 for T4
    -            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
    -
    -    def tearDown(self):
    -        torch.cuda.empty_cache()
    -        gc.collect()
    -
    -    @slow
    -    def test_model_7b_logits(self):
    -        input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338]
    -        model = MistralForCausalLM.from_pretrained(
    -            "mistralai/Mistral-7B-v0.1", device_map="auto", torch_dtype=torch.float16
    -        )
    -        input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
    -        with torch.no_grad():
    -            out = model(input_ids).logits.cpu()
    -        # Expected mean on dim = -1
    -        EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]])
    -        torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
    -
    -        EXPECTED_SLICE = {
    -            7: torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]),
    -            8: torch.tensor([-5.8711, -5.8555, -0.1050, -4.7148, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -1.0781, 1.7568, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711]),
    -        }  # fmt: skip
    -
    -        print(out[0, 0, :30])
    -        torch.testing.assert_close(
    -            out[0, 0, :30], EXPECTED_SLICE[self.cuda_compute_capability_major_version], atol=1e-4, rtol=1e-4
    -        )
    -
    -        del model
    -        backend_empty_cache(torch_device)
    -        gc.collect()
    -
    -    @slow
    -    @require_bitsandbytes
    -    def test_model_7b_generation(self):
    -        EXPECTED_TEXT_COMPLETION = {
    -            7: "My favourite condiment is 100% ketchup. I love it on everything. I'm not a big",
    -            8: "My favourite condiment is 100% ketchup. I’m not a fan of mustard, mayo,",
    -        }
    -
    -        prompt = "My favourite condiment is "
    -        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
    -        model = MistralForCausalLM.from_pretrained(
    -            "mistralai/Mistral-7B-v0.1", device_map={"": torch_device}, load_in_4bit=True
    -        )
    -        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
    -
    -        # greedy generation outputs
    -        generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
    -        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    -        self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)
    -
    -        del model
    -        backend_empty_cache(torch_device)
    -        gc.collect()
    -
    -    @require_bitsandbytes
    -    @slow
    -    @require_flash_attn
    -    def test_model_7b_long_prompt(self):
    -        EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
    -        # An input with 4097 tokens that is above the size of the sliding window
    -        input_ids = [1] + [306, 338] * 2048
    -        model = MistralForCausalLM.from_pretrained(
    -            "mistralai/Mistral-7B-v0.1",
    -            device_map={"": torch_device},
    -            load_in_4bit=True,
    -            attn_implementation="flash_attention_2",
    -        )
    -        input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
    -        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
    -        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
    -
    -        # Assisted generation
    -        assistant_model = model
    -        assistant_model.generation_config.num_assistant_tokens = 2
    -        assistant_model.generation_config.num_assistant_tokens_schedule = "constant"
    -        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0, assistant_model=assistant_model)
    -        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
    -
    -        del assistant_model
    -        del model
    -        backend_empty_cache(torch_device)
    -        gc.collect()
    -
    -    @slow
    -    @require_torch_sdpa
    -    def test_model_7b_long_prompt_sdpa(self):
    -        EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
    -        # An input with 4097 tokens that is above the size of the sliding window
    -        input_ids = [1] + [306, 338] * 2048
    -        model = MistralForCausalLM.from_pretrained(
    -            "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.float16
    -        )
    -        input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
    -        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
    -        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
    -
    -        # Assisted generation
    -        assistant_model = model
    -        assistant_model.generation_config.num_assistant_tokens = 2
    -        assistant_model.generation_config.num_assistant_tokens_schedule = "constant"
    -        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0, assistant_model=assistant_model)
    -        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
    -
    -        del assistant_model
    -
    -        backend_empty_cache(torch_device)
    -        gc.collect()
    -
    -        EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big"""
    -        prompt = "My favourite condiment is "
    -        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
    -
    -        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
    -
    -        # greedy generation outputs
    -        generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0)
    -        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    -        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
    -
    -    @slow
    -    def test_speculative_generation(self):
    -        EXPECTED_TEXT_COMPLETION = {
    -            7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs",
    -            8: "My favourite condiment is 100% Sriracha. I love the heat, the sweetness, the tang",
    -        }
    -        prompt = "My favourite condiment is "
    -        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
    -        model = MistralForCausalLM.from_pretrained(
    -            "mistralai/Mistral-7B-v0.1", device_map="auto", torch_dtype=torch.float16
    -        )
    -        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
    -
    -        # greedy generation outputs
    -        set_seed(0)
    -        generated_ids = model.generate(
    -            input_ids, max_new_tokens=20, do_sample=True, temperature=0.3, assistant_model=model
    -        )
    -        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    -        self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)
    -
    -        del model
    -        backend_empty_cache(torch_device)
    -        gc.collect()
    -
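For orientation, the assisted-generation pattern exercised by the removed test_model_7b_long_prompt and test_speculative_generation tests amounts to configuring a draft model and handing it to generate() through the assistant_model argument. The sketch below only illustrates that flow; the prompt, the token budgets, and the choice of letting the model draft for itself are placeholders rather than values taken from the deleted tests.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Minimal sketch of assisted (speculative) decoding with transformers.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto"
)

# The deleted tests let the model draft for itself; in practice a smaller
# checkpoint is usually passed as the assistant.
assistant = model
assistant.generation_config.num_assistant_tokens = 2
assistant.generation_config.num_assistant_tokens_schedule = "constant"

inputs = tokenizer("My favourite condiment is ", return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs, assistant_model=assistant, max_new_tokens=20, do_sample=False
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))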
    diff --git a/tests/transformers/tests/models/mixtral/__init__.py b/tests/transformers/tests/models/mixtral/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/mixtral/test_modeling_mixtral.py b/tests/transformers/tests/models/mixtral/test_modeling_mixtral.py
    deleted file mode 100644
    index 857a254a5d..0000000000
    --- a/tests/transformers/tests/models/mixtral/test_modeling_mixtral.py
    +++ /dev/null
    @@ -1,611 +0,0 @@
    -
    -# coding=utf-8
    -# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -""" Testing suite for the PyTorch Mixtral model. """
    -
    -
    -import tempfile
    -import unittest
    -
    -import pytest
    -
    -from transformers import MixtralConfig, is_torch_available
    -from transformers.testing_utils import (
    -    is_flaky,
    -    require_flash_attn,
    -    require_torch,
    -    require_torch_gpu,
    -    require_torch_sdpa,
    -    slow,
    -    torch_device,
    -)
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...generation.test_utils import GenerationTesterMixin
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, ids_tensor
    -
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -if is_torch_available():
    -    import torch
    -
    -    from transformers import MixtralForCausalLM, MixtralForSequenceClassification, MixtralModel
    -
    -
    -class MixtralModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        seq_length=7,
    -        is_training=True,
    -        use_input_mask=True,
    -        use_token_type_ids=False,
    -        use_labels=True,
    -        vocab_size=99,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        num_key_value_heads=2,
    -        intermediate_size=37,
    -        hidden_act="gelu",
    -        hidden_dropout_prob=0.1,
    -        attention_probs_dropout_prob=0.1,
    -        max_position_embeddings=512,
    -        type_vocab_size=16,
    -        type_sequence_label_size=2,
    -        initializer_range=0.02,
    -        num_labels=3,
    -        num_choices=4,
    -        pad_token_id=0,
    -        scope=None,
    -        router_jitter_noise=0.1,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.seq_length = seq_length
    -        self.is_training = is_training
    -        self.use_input_mask = use_input_mask
    -        self.use_token_type_ids = use_token_type_ids
    -        self.use_labels = use_labels
    -        self.vocab_size = vocab_size
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.num_key_value_heads = num_key_value_heads
    -        self.intermediate_size = intermediate_size
    -        self.hidden_act = hidden_act
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.attention_probs_dropout_prob = attention_probs_dropout_prob
    -        self.max_position_embeddings = max_position_embeddings
    -        self.type_vocab_size = type_vocab_size
    -        self.type_sequence_label_size = type_sequence_label_size
    -        self.initializer_range = initializer_range
    -        self.num_labels = num_labels
    -        self.num_choices = num_choices
    -        self.pad_token_id = pad_token_id
    -        self.scope = scope
    -        self.router_jitter_noise = router_jitter_noise
    -
    -    # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs
    -    def prepare_config_and_inputs(self):
    -        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
    -
    -        input_mask = None
    -        if self.use_input_mask:
    -            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
    -
    -        token_type_ids = None
    -        if self.use_token_type_ids:
    -            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
    -
    -        sequence_labels = None
    -        token_labels = None
    -        choice_labels = None
    -        if self.use_labels:
    -            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    -            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
    -            choice_labels = ids_tensor([self.batch_size], self.num_choices)
    -
    -        config = self.get_config()
    -
    -        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -
    -    def get_config(self):
    -        return MixtralConfig(
    -            vocab_size=self.vocab_size,
    -            hidden_size=self.hidden_size,
    -            num_hidden_layers=self.num_hidden_layers,
    -            num_attention_heads=self.num_attention_heads,
    -            num_key_value_heads=self.num_key_value_heads,
    -            intermediate_size=self.intermediate_size,
    -            hidden_act=self.hidden_act,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
    -            max_position_embeddings=self.max_position_embeddings,
    -            type_vocab_size=self.type_vocab_size,
    -            is_decoder=False,
    -            initializer_range=self.initializer_range,
    -            pad_token_id=self.pad_token_id,
    -            num_experts_per_tok=2,
    -            num_local_experts=2,
    -            router_jitter_noise=self.router_jitter_noise,
    -        )
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mixtral
    -    def create_and_check_model(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        model = MixtralModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask)
    -        result = model(input_ids)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mixtral
    -    def create_and_check_model_as_decoder(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.add_cross_attention = True
    -        model = MixtralModel(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -        )
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -        )
    -        result = model(input_ids, attention_mask=input_mask)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mixtral
    -    def create_and_check_for_causal_lm(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        model = MixtralForCausalLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mixtral
    -    def create_and_check_decoder_model_past_large_inputs(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.is_decoder = True
    -        config.add_cross_attention = True
    -        model = MixtralForCausalLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        # first forward pass
    -        outputs = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            use_cache=True,
    -        )
    -        past_key_values = outputs.past_key_values
    -
    -        # create hypothetical multiple next tokens and extend to next_input_ids
    -        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    -        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
    -
    -        # append to next input_ids and next attention mask
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
    -
    -        output_from_no_past = model(
    -            next_input_ids,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -        output_from_past = model(
    -            next_tokens,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            past_key_values=past_key_values,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
    -
    -        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Mixtral
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -        ) = config_and_inputs
    -        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -# Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Mixtral
    -class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification) if is_torch_available() else ()
    -    )
    -    all_generative_model_classes = (MixtralForCausalLM,) if is_torch_available() else ()
    -    pipeline_model_mapping = (
    -        {
    -            "feature-extraction": MixtralModel,
    -            "text-classification": MixtralForSequenceClassification,
    -            "text-generation": MixtralForCausalLM,
    -            "zero-shot": MixtralForSequenceClassification,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    test_headmasking = False
    -    test_pruning = False
    -
    -    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
    -    def is_pipeline_test_to_skip(
    -        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    -    ):
    -        return True
    -
    -    # TODO: @Fxmarty
    -    @is_flaky(max_attempts=3, description="flaky on some models.")
    -    @require_torch_sdpa
    -    @slow
    -    def test_eager_matches_sdpa_generate(self):
    -        super().test_eager_matches_sdpa_generate()
    -
    -    def setUp(self):
    -        self.model_tester = MixtralModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_various_embeddings(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        for type in ["absolute", "relative_key", "relative_key_query"]:
    -            config_and_inputs[0].position_embedding_type = type
    -            self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_Mixtral_sequence_classification_model(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        print(config)
    -        config.num_labels = 3
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
    -        model = MixtralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    def test_Mixtral_sequence_classification_model_for_single_label(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.num_labels = 3
    -        config.problem_type = "single_label_classification"
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
    -        model = MixtralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    def test_Mixtral_sequence_classification_model_for_multi_label(self):
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.num_labels = 3
    -        config.problem_type = "multi_label_classification"
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        sequence_labels = ids_tensor(
    -            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
    -        ).to(torch.float)
    -        model = MixtralForSequenceClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
    -        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
    -
    -    @unittest.skip("Mixtral buffers include complex numbers, which breaks this test")
    -    def test_save_load_fast_init_from_base(self):
    -        pass
    -
    -    @unittest.skip("Mixtral uses GQA on all models so the KV cache is a non standard format")
    -    def test_past_key_values_format(self):
    -        pass
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_generate_padding_right(self):
    -        import torch
    -
    -        for model_class in self.all_generative_model_classes:
    -            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -            model = model_class(config)
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
    -                    torch_device
    -                )
    -
    -                dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
    -                dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device)
    -
    -                model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False)
    -
    -                model = model_class.from_pretrained(
    -                    tmpdirname,
    -                    torch_dtype=torch.float16,
    -                    attn_implementation="flash_attention_2",
    -                    low_cpu_mem_usage=True,
    -                ).to(torch_device)
    -
    -                with self.assertRaises(ValueError):
    -                    _ = model.generate(
    -                        dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
    -                    )
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_generate_use_cache(self):
    -        import torch
    -
    -        max_new_tokens = 30
    -
    -        for model_class in self.all_generative_model_classes:
    -            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            dummy_input = inputs_dict[model_class.main_input_name]
    -            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
    -                dummy_input = dummy_input.to(torch.float16)
    -
    -            # make sure that all models have enough positions for generation
    -            if hasattr(config, "max_position_embeddings"):
    -                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
    -
    -            model = model_class(config)
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -
    -                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
    -                # NOTE: Mixtral apparently does not support right padding + use_cache with FA2.
    -                dummy_attention_mask[:, -1] = 1
    -
    -                model = model_class.from_pretrained(
    -                    tmpdirname,
    -                    torch_dtype=torch.float16,
    -                    attn_implementation="flash_attention_2",
    -                    low_cpu_mem_usage=True,
    -                ).to(torch_device)
    -
    -                # Just test that a large cache works as expected
    -                _ = model.generate(
    -                    dummy_input,
    -                    attention_mask=dummy_attention_mask,
    -                    max_new_tokens=max_new_tokens,
    -                    do_sample=False,
    -                    use_cache=True,
    -                )
    -
    -    @require_flash_attn
    -    @require_torch_gpu
    -    @pytest.mark.flash_attn_test
    -    @slow
    -    def test_flash_attn_2_inference_equivalence_right_padding(self):
    -        self.skipTest("Mixtral flash attention does not support right padding")
    -
    -    # Ignore copy
    -    def test_load_balancing_loss(self):
    -        r"""
    -        Let's make sure we can actually compute the loss and do a backward on it.
    -        """
    -        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.num_labels = 3
    -        config.num_local_experts = 8
    -        config.output_router_logits = True
    -        input_ids = input_dict["input_ids"]
    -        attention_mask = input_ids.ne(1).to(torch_device)
    -        model = MixtralForCausalLM(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=attention_mask)
    -        self.assertEqual(result.router_logits[0].shape, (91, config.num_local_experts))
    -        torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(2, dtype=torch.float32), rtol=1e-2, atol=1e-2)
    -
    -        # First, we make sure that adding padding tokens doesn't change the loss
    -        # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding)
    -        pad_length = 1000
    -        # Add padding tokens (assume that pad_token_id=1) to input_ids
    -        padding_block = torch.ones(input_ids.shape[0], pad_length, dtype=torch.int32).to(torch_device)
    -        padded_input_ids = torch.cat((padding_block, input_ids), dim=1)  # this is to simulate padding to the left
    -        padded_attention_mask = padded_input_ids.ne(1).to(torch_device)
    -
    -        padded_result = model(padded_input_ids, attention_mask=padded_attention_mask)
    -        torch.testing.assert_close(result.aux_loss.cpu(), padded_result.aux_loss.cpu(), rtol=1e-4, atol=1e-4)
    -
    -        # We make sure that the loss when including padding tokens != the loss without padding tokens
    -        # if attention_mask=None --> we don't exclude padding tokens
    -        include_padding_result = model(padded_input_ids, attention_mask=None)
    -
    -        # This is to mimic torch.testing.assert_not_close
    -        self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item())
    -
    -
    -@require_torch
    -class MixtralIntegrationTest(unittest.TestCase):
    -    # This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
    -    # Depending on the hardware, we get different logits / generations.
    -    cuda_compute_capability_major_version = None
    -
    -    @classmethod
    -    def setUpClass(cls):
    -        if is_torch_available() and torch.cuda.is_available():
    -            # 8 is for A100 / A10 and 7 for T4
    -            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
    -
    -    @slow
    -    @require_torch_gpu
    -    def test_small_model_logits(self):
    -        model_id = "hf-internal-testing/Mixtral-tiny"
    -        dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
    -
    -        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
    -            torch_device
    -        )
    -        # TODO: might need to tweak it in case the logits do not match on our daily runners
    -        # these logits have been obtained with the original megablocks implementation.
    -        EXPECTED_LOGITS = {
    -            7: torch.Tensor([[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]]).to(
    -                torch_device
    -            ),
    -            8: torch.Tensor([[0.1631, 0.1621, 0.6094], [-0.8906, -0.1621, -0.6094], [0.1572, 0.1270, 0.7227]]).to(
    -                torch_device
    -            ),
    -        }
    -        with torch.no_grad():
    -            logits = model(dummy_input).logits
    -
    -        torch.testing.assert_close(
    -            logits[0, :3, :3], EXPECTED_LOGITS[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3
    -        )
    -        torch.testing.assert_close(
    -            logits[1, :3, :3], EXPECTED_LOGITS[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3
    -        )
    -
    -    @slow
    -    @require_torch_gpu
    -    def test_small_model_logits_batched(self):
    -        model_id = "hf-internal-testing/Mixtral-tiny"
    -        dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device)
    -        attention_mask = dummy_input.ne(0).to(torch.long)
    -
    -        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
    -            torch_device
    -        )
    -
    -        # TODO: might need to tweak it in case the logits do not match on our daily runners
    -        EXPECTED_LOGITS_LEFT = {
    -            7: torch.Tensor(
    -                [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]],
    -            ).to(torch_device),
    -            8: torch.Tensor([[0.1914, 0.0508, 0.7188], [0.1953, 0.0510, 0.7227], [0.1973, 0.0562, 0.7148]]).to(
    -                torch_device
    -            ),
    -        }
    -
    -        EXPECTED_LOGITS_LEFT_UNPADDED = {
    -            7: torch.Tensor(
    -                [[0.2212, 0.5200, -0.3816], [0.8213, -0.2313, 0.6069], [0.2664, -0.7090, 0.2468]],
    -            ).to(torch_device),
    -            8: torch.Tensor([[0.2217, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7109, 0.2461]]).to(
    -                torch_device
    -            ),
    -        }
    -
    -        EXPECTED_LOGITS_RIGHT_UNPADDED = {
    -            7: torch.Tensor([[0.2205, 0.1232, -0.1611], [-0.3484, 0.3030, -1.0312], [0.0742, 0.7930, 0.7969]]).to(
    -                torch_device
    -            ),
    -            8: torch.Tensor([[0.2178, 0.1260, -0.1621], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]]).to(
    -                torch_device
    -            ),
    -        }
    -
    -        with torch.no_grad():
    -            logits = model(dummy_input, attention_mask=attention_mask).logits
    -
    -        torch.testing.assert_close(
    -            logits[0, :3, :3], EXPECTED_LOGITS_LEFT[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3
    -        )
    -        torch.testing.assert_close(
    -            logits[0, -3:, -3:],
    -            EXPECTED_LOGITS_LEFT_UNPADDED[self.cuda_compute_capability_major_version],
    -            atol=1e-3,
    -            rtol=1e-3,
    -        )
    -        torch.testing.assert_close(
    -            logits[1, -3:, -3:],
    -            EXPECTED_LOGITS_RIGHT_UNPADDED[self.cuda_compute_capability_major_version],
    -            atol=1e-3,
    -            rtol=1e-3,
    -        )
    -
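The removed test_load_balancing_loss checks two properties of Mixtral's router auxiliary loss: the per-layer router logits have shape (batch * seq_len, num_local_experts), and left-padding tokens that are masked out do not change the loss. A self-contained sketch of a Switch-style load-balancing loss with top-2 routing and an optional padding mask follows for reference; it is an illustration under those assumptions, not the transformers implementation.

from typing import Optional

import torch
import torch.nn.functional as F


def load_balancing_loss(
    router_logits: torch.Tensor,  # (num_tokens, num_experts)
    top_k: int = 2,
    attention_mask: Optional[torch.Tensor] = None,  # (num_tokens,), 1 = keep, 0 = padding
) -> torch.Tensor:
    num_experts = router_logits.shape[-1]
    probs = F.softmax(router_logits, dim=-1)
    _, selected = torch.topk(probs, top_k, dim=-1)
    expert_mask = F.one_hot(selected, num_experts).float()  # (tokens, top_k, experts)

    if attention_mask is None:
        tokens_per_expert = expert_mask.mean(dim=(0, 1))  # fraction of assignments per expert
        router_prob_per_expert = probs.mean(dim=0)  # average router probability per expert
    else:
        keep = attention_mask.float()
        n = keep.sum()
        # Padding tokens contribute to neither statistic, which is why the test
        # expects the same aux loss with and without masked left padding.
        tokens_per_expert = (expert_mask * keep[:, None, None]).sum(dim=(0, 1)) / (n * top_k)
        router_prob_per_expert = (probs * keep[:, None]).sum(dim=0) / n

    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)

For a batched model output, router_logits would first be flattened to (batch * seq_len, num_experts), matching the (91, num_local_experts) shape the test reads from result.router_logits[0] with batch_size=13 and seq_length=7.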
    diff --git a/tests/transformers/tests/models/roberta/__init__.py b/tests/transformers/tests/models/roberta/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/roberta/test_modeling_roberta.py b/tests/transformers/tests/models/roberta/test_modeling_roberta.py
    deleted file mode 100644
    index eaa7aa7564..0000000000
    --- a/tests/transformers/tests/models/roberta/test_modeling_roberta.py
    +++ /dev/null
    @@ -1,587 +0,0 @@
    -
    -# coding=utf-8
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -
    -import unittest
    -
    -from transformers import RobertaConfig, is_torch_available
    -from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...generation.test_utils import GenerationTesterMixin
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
    -
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -if is_torch_available():
    -    import torch
    -
    -    from transformers import (
    -        RobertaForCausalLM,
    -        RobertaForMaskedLM,
    -        RobertaForMultipleChoice,
    -        RobertaForQuestionAnswering,
    -        RobertaForSequenceClassification,
    -        RobertaForTokenClassification,
    -        RobertaModel,
    -    )
    -    from transformers.models.roberta.modeling_roberta import (
    -        ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
    -        RobertaEmbeddings,
    -        create_position_ids_from_input_ids,
    -    )
    -
    -ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
    -
    -
    -class RobertaModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        seq_length=7,
    -        is_training=True,
    -        use_input_mask=True,
    -        use_token_type_ids=True,
    -        use_labels=True,
    -        vocab_size=99,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        intermediate_size=37,
    -        hidden_act="gelu",
    -        hidden_dropout_prob=0.1,
    -        attention_probs_dropout_prob=0.1,
    -        max_position_embeddings=512,
    -        type_vocab_size=16,
    -        type_sequence_label_size=2,
    -        initializer_range=0.02,
    -        num_labels=3,
    -        num_choices=4,
    -        scope=None,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.seq_length = seq_length
    -        self.is_training = is_training
    -        self.use_input_mask = use_input_mask
    -        self.use_token_type_ids = use_token_type_ids
    -        self.use_labels = use_labels
    -        self.vocab_size = vocab_size
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.intermediate_size = intermediate_size
    -        self.hidden_act = hidden_act
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.attention_probs_dropout_prob = attention_probs_dropout_prob
    -        self.max_position_embeddings = max_position_embeddings
    -        self.type_vocab_size = type_vocab_size
    -        self.type_sequence_label_size = type_sequence_label_size
    -        self.initializer_range = initializer_range
    -        self.num_labels = num_labels
    -        self.num_choices = num_choices
    -        self.scope = scope
    -
    -    def prepare_config_and_inputs(self):
    -        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
    -
    -        input_mask = None
    -        if self.use_input_mask:
    -            input_mask = random_attention_mask([self.batch_size, self.seq_length])
    -
    -        token_type_ids = None
    -        if self.use_token_type_ids:
    -            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
    -
    -        sequence_labels = None
    -        token_labels = None
    -        choice_labels = None
    -        if self.use_labels:
    -            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    -            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
    -            choice_labels = ids_tensor([self.batch_size], self.num_choices)
    -
    -        config = self.get_config()
    -
    -        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -
    -    def get_config(self):
    -        return RobertaConfig(
    -            vocab_size=self.vocab_size,
    -            hidden_size=self.hidden_size,
    -            num_hidden_layers=self.num_hidden_layers,
    -            num_attention_heads=self.num_attention_heads,
    -            intermediate_size=self.intermediate_size,
    -            hidden_act=self.hidden_act,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
    -            max_position_embeddings=self.max_position_embeddings,
    -            type_vocab_size=self.type_vocab_size,
    -            initializer_range=self.initializer_range,
    -        )
    -
    -    def get_pipeline_config(self):
    -        config = self.get_config()
    -        config.vocab_size = 300
    -        return config
    -
    -    def prepare_config_and_inputs_for_decoder(self):
    -        (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -        ) = self.prepare_config_and_inputs()
    -
    -        config.is_decoder = True
    -        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
    -        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
    -
    -        return (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -            encoder_hidden_states,
    -            encoder_attention_mask,
    -        )
    -
    -    def create_and_check_model(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        model = RobertaModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    -        result = model(input_ids, token_type_ids=token_type_ids)
    -        result = model(input_ids)
    -
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
    -
    -    def create_and_check_model_as_decoder(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.add_cross_attention = True
    -        model = RobertaModel(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            token_type_ids=token_type_ids,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -        )
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            token_type_ids=token_type_ids,
    -            encoder_hidden_states=encoder_hidden_states,
    -        )
    -        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
    -
    -    def create_and_check_for_causal_lm(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        model = RobertaForCausalLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    -
    -    def create_and_check_decoder_model_past_large_inputs(
    -        self,
    -        config,
    -        input_ids,
    -        token_type_ids,
    -        input_mask,
    -        sequence_labels,
    -        token_labels,
    -        choice_labels,
    -        encoder_hidden_states,
    -        encoder_attention_mask,
    -    ):
    -        config.is_decoder = True
    -        config.add_cross_attention = True
    -        model = RobertaForCausalLM(config=config).to(torch_device).eval()
    -
    -        # make sure that ids don't start with pad token
    -        mask = input_ids.ne(config.pad_token_id).long()
    -        input_ids = input_ids * mask
    -
    -        # first forward pass
    -        outputs = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            use_cache=True,
    -        )
    -        past_key_values = outputs.past_key_values
    -
    -        # create hypothetical multiple next tokens and extend to next_input_ids
    -        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    -
    -        # make sure that ids don't start with pad token
    -        mask = next_tokens.ne(config.pad_token_id).long()
    -        next_tokens = next_tokens * mask
    -        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
    -
    -        # append to next input_ids and next attention mask
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
    -
    -        output_from_no_past = model(
    -            next_input_ids,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -        output_from_past = model(
    -            next_tokens,
    -            attention_mask=next_attention_mask,
    -            encoder_hidden_states=encoder_hidden_states,
    -            encoder_attention_mask=encoder_attention_mask,
    -            past_key_values=past_key_values,
    -            output_hidden_states=True,
    -        )["hidden_states"][0]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
    -
    -        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    def create_and_check_for_masked_lm(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        model = RobertaForMaskedLM(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    -
    -    def create_and_check_for_token_classification(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        config.num_labels = self.num_labels
    -        model = RobertaForTokenClassification(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
    -
    -    def create_and_check_for_multiple_choice(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        config.num_choices = self.num_choices
    -        model = RobertaForMultipleChoice(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    -        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    -        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    -        result = model(
    -            multiple_choice_inputs_ids,
    -            attention_mask=multiple_choice_input_mask,
    -            token_type_ids=multiple_choice_token_type_ids,
    -            labels=choice_labels,
    -        )
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
    -
    -    def create_and_check_for_question_answering(
    -        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    -    ):
    -        model = RobertaForQuestionAnswering(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids,
    -            attention_mask=input_mask,
    -            token_type_ids=token_type_ids,
    -            start_positions=sequence_labels,
    -            end_positions=sequence_labels,
    -        )
    -        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
    -        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -        ) = config_and_inputs
    -        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (
    -            RobertaForCausalLM,
    -            RobertaForMaskedLM,
    -            RobertaModel,
    -            RobertaForSequenceClassification,
    -            RobertaForTokenClassification,
    -            RobertaForMultipleChoice,
    -            RobertaForQuestionAnswering,
    -        )
    -        if is_torch_available()
    -        else ()
    -    )
    -    all_generative_model_classes = (RobertaForCausalLM,) if is_torch_available() else ()
    -    pipeline_model_mapping = (
    -        {
    -            "feature-extraction": RobertaModel,
    -            "fill-mask": RobertaForMaskedLM,
    -            "question-answering": RobertaForQuestionAnswering,
    -            "text-classification": RobertaForSequenceClassification,
    -            "text-generation": RobertaForCausalLM,
    -            "token-classification": RobertaForTokenClassification,
    -            "zero-shot": RobertaForSequenceClassification,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    fx_compatible = True
    -    model_split_percents = [0.5, 0.8, 0.9]
    -
    -    def setUp(self):
    -        self.model_tester = RobertaModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_various_embeddings(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        for embedding_type in ["absolute", "relative_key", "relative_key_query"]:
    -            config_and_inputs[0].position_embedding_type = embedding_type
    -            self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_as_decoder(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    -        self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
    -
    -    def test_model_as_decoder_with_default_input_mask(self):
    -        # This regression test was failing with PyTorch < 1.3
    -        (
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -            encoder_hidden_states,
    -            encoder_attention_mask,
    -        ) = self.model_tester.prepare_config_and_inputs_for_decoder()
    -
    -        input_mask = None
    -
    -        self.model_tester.create_and_check_model_as_decoder(
    -            config,
    -            input_ids,
    -            token_type_ids,
    -            input_mask,
    -            sequence_labels,
    -            token_labels,
    -            choice_labels,
    -            encoder_hidden_states,
    -            encoder_attention_mask,
    -        )
    -
    -    def test_for_causal_lm(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    -        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
    -
    -    def test_decoder_model_past_with_large_inputs(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    -        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
    -
    -    def test_decoder_model_past_with_large_inputs_relative_pos_emb(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
    -        config_and_inputs[0].position_embedding_type = "relative_key"
    -        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
    -
    -    def test_for_masked_lm(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
    -
    -    def test_for_token_classification(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
    -
    -    def test_for_multiple_choice(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
    -
    -    def test_for_question_answering(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model_name = "FacebookAI/roberta-base"
    -        model = RobertaModel.from_pretrained(model_name)
    -        self.assertIsNotNone(model)
    -
    -    def test_create_position_ids_respects_padding_index(self):
    -        """Ensure that the default position ids only assign a sequential . This is a regression
    -        test for https://github.com/huggingface/transformers/issues/1761
    -
    -        The position ids should be masked with the embedding object's padding index. Therefore, the
    -        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
    -        """
    -        config = self.model_tester.prepare_config_and_inputs()[0]
    -        model = RobertaEmbeddings(config=config)
    -
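    -        # The last token is the padding token: its position id must stay at padding_idx, while the real tokens count up from padding_idx + 1.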
    -        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
    -        expected_positions = torch.as_tensor(
    -            [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
    -        )
    -
    -        position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx)
    -        self.assertEqual(position_ids.shape, expected_positions.shape)
    -        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
    -
    -    def test_create_position_ids_from_inputs_embeds(self):
    -        """Ensure that the default position ids only assign a sequential . This is a regression
    -        test for https://github.com/huggingface/transformers/issues/1761
    -
    -        The position ids should be masked with the embedding object's padding index. Therefore, the
    -        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
    -        """
    -        config = self.model_tester.prepare_config_and_inputs()[0]
    -        embeddings = RobertaEmbeddings(config=config)
    -
    -        inputs_embeds = torch.empty(2, 4, 30)
    -        expected_single_positions = [
    -            0 + embeddings.padding_idx + 1,
    -            1 + embeddings.padding_idx + 1,
    -            2 + embeddings.padding_idx + 1,
    -            3 + embeddings.padding_idx + 1,
    -        ]
    -        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
    -        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
    -        self.assertEqual(position_ids.shape, expected_positions.shape)
    -        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
    -
    -
    -@require_torch
    -class RobertaModelIntegrationTest(TestCasePlus):
    -    @slow
    -    def test_inference_masked_lm(self):
    -        model = RobertaForMaskedLM.from_pretrained("FacebookAI/roberta-base")
    -
    -        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    -        with torch.no_grad():
    -            output = model(input_ids)[0]
    -        expected_shape = torch.Size((1, 11, 50265))
    -        self.assertEqual(output.shape, expected_shape)
    -        # compare the actual values for a slice.
    -        expected_slice = torch.tensor(
    -            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
    -        )
    -
    -        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
    -        # roberta.eval()
    -        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
    -
    -        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
    -
    -    @slow
    -    def test_inference_no_head(self):
    -        model = RobertaModel.from_pretrained("FacebookAI/roberta-base")
    -
    -        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    -        with torch.no_grad():
    -            output = model(input_ids)[0]
    -        # compare the actual values for a slice.
    -        expected_slice = torch.tensor(
    -            [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
    -        )
    -
    -        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
    -        # roberta.eval()
    -        # expected_slice = roberta.extract_features(input_ids)[:, :3, :3].detach()
    -
    -        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
    -
    -    @slow
    -    def test_inference_classification_head(self):
    -        model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-large-mnli")
    -
    -        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    -        with torch.no_grad():
    -            output = model(input_ids)[0]
    -        expected_shape = torch.Size((1, 3))
    -        self.assertEqual(output.shape, expected_shape)
    -        expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]])
    -
    -        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
    -        # roberta.eval()
    -        # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach()
    -
    -        self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4))
    -
    diff --git a/tests/transformers/tests/models/swin/__init__.py b/tests/transformers/tests/models/swin/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/swin/test_modeling_swin.py b/tests/transformers/tests/models/swin/test_modeling_swin.py
    deleted file mode 100644
    index 41b26c11ee..0000000000
    --- a/tests/transformers/tests/models/swin/test_modeling_swin.py
    +++ /dev/null
    @@ -1,507 +0,0 @@
    -# coding=utf-8
    -# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -""" Testing suite for the PyTorch Swin model. """
    -
    -import collections
    -import unittest
    -
    -from transformers import SwinConfig
    -from transformers.testing_utils import require_torch, require_vision, slow, torch_device
    -from transformers.utils import cached_property, is_torch_available, is_vision_available
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
    -
    -
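    -# Run this test suite on Habana Gaudi (HPU) with the Gaudi-adapted Transformers model implementations.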
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -if is_torch_available():
    -    import torch
    -    from torch import nn
    -
    -    from transformers import SwinBackbone, SwinForImageClassification, SwinForMaskedImageModeling, SwinModel
    -    from transformers.models.swin.modeling_swin import SWIN_PRETRAINED_MODEL_ARCHIVE_LIST
    -
    -
    -if is_vision_available():
    -    from PIL import Image
    -
    -    from transformers import AutoImageProcessor
    -
    -
    -class SwinModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        image_size=32,
    -        patch_size=2,
    -        num_channels=3,
    -        embed_dim=16,
    -        depths=[1, 2, 1],
    -        num_heads=[2, 2, 4],
    -        window_size=2,
    -        mlp_ratio=2.0,
    -        qkv_bias=True,
    -        hidden_dropout_prob=0.0,
    -        attention_probs_dropout_prob=0.0,
    -        drop_path_rate=0.1,
    -        hidden_act="gelu",
    -        use_absolute_embeddings=False,
    -        patch_norm=True,
    -        initializer_range=0.02,
    -        layer_norm_eps=1e-5,
    -        is_training=True,
    -        scope=None,
    -        use_labels=True,
    -        type_sequence_label_size=10,
    -        encoder_stride=8,
    -        out_features=["stage1", "stage2"],
    -        out_indices=[1, 2],
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.image_size = image_size
    -        self.patch_size = patch_size
    -        self.num_channels = num_channels
    -        self.embed_dim = embed_dim
    -        self.depths = depths
    -        self.num_heads = num_heads
    -        self.window_size = window_size
    -        self.mlp_ratio = mlp_ratio
    -        self.qkv_bias = qkv_bias
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.attention_probs_dropout_prob = attention_probs_dropout_prob
    -        self.drop_path_rate = drop_path_rate
    -        self.hidden_act = hidden_act
    -        self.use_absolute_embeddings = use_absolute_embeddings
    -        self.patch_norm = patch_norm
    -        self.layer_norm_eps = layer_norm_eps
    -        self.initializer_range = initializer_range
    -        self.is_training = is_training
    -        self.scope = scope
    -        self.use_labels = use_labels
    -        self.type_sequence_label_size = type_sequence_label_size
    -        self.encoder_stride = encoder_stride
    -        self.out_features = out_features
    -        self.out_indices = out_indices
    -
    -    def prepare_config_and_inputs(self):
    -        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
    -
    -        labels = None
    -        if self.use_labels:
    -            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    -
    -        config = self.get_config()
    -
    -        return config, pixel_values, labels
    -
    -    def get_config(self):
    -        return SwinConfig(
    -            image_size=self.image_size,
    -            patch_size=self.patch_size,
    -            num_channels=self.num_channels,
    -            embed_dim=self.embed_dim,
    -            depths=self.depths,
    -            num_heads=self.num_heads,
    -            window_size=self.window_size,
    -            mlp_ratio=self.mlp_ratio,
    -            qkv_bias=self.qkv_bias,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
    -            drop_path_rate=self.drop_path_rate,
    -            hidden_act=self.hidden_act,
    -            use_absolute_embeddings=self.use_absolute_embeddings,
    -            path_norm=self.patch_norm,
    -            layer_norm_eps=self.layer_norm_eps,
    -            initializer_range=self.initializer_range,
    -            encoder_stride=self.encoder_stride,
    -            out_features=self.out_features,
    -            out_indices=self.out_indices,
    -        )
    -
    -    def create_and_check_model(self, config, pixel_values, labels):
    -        model = SwinModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -
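    -        # Each of the len(depths) - 1 patch-merging stages quarters the number of tokens and doubles the channel dimension.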
    -        expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1))
    -        expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
    -
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim))
    -
    -    def create_and_check_backbone(self, config, pixel_values, labels):
    -        model = SwinBackbone(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -
    -        # verify hidden states
    -        self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
    -        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16])
    -
    -        # verify channels
    -        self.parent.assertEqual(len(model.channels), len(config.out_features))
    -
    -        # verify backbone works with out_features=None
    -        config.out_features = None
    -        model = SwinBackbone(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -
    -        # verify feature maps
    -        self.parent.assertEqual(len(result.feature_maps), 1)
    -        self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4])
    -
    -        # verify channels
    -        self.parent.assertEqual(len(model.channels), 1)
    -
    -    def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels):
    -        model = SwinForMaskedImageModeling(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -        self.parent.assertEqual(
    -            result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size)
    -        )
    -
    -        # test greyscale images
    -        config.num_channels = 1
    -        model = SwinForMaskedImageModeling(config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
    -        result = model(pixel_values)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size))
    -
    -    def create_and_check_for_image_classification(self, config, pixel_values, labels):
    -        config.num_labels = self.type_sequence_label_size
    -        model = SwinForImageClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values, labels=labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
    -
    -        # test greyscale images
    -        config.num_channels = 1
    -        model = SwinForImageClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
    -        result = model(pixel_values)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            pixel_values,
    -            labels,
    -        ) = config_and_inputs
    -        inputs_dict = {"pixel_values": pixel_values}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class SwinModelTest(ModelTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (
    -            SwinModel,
    -            SwinBackbone,
    -            SwinForImageClassification,
    -            SwinForMaskedImageModeling,
    -        )
    -        if is_torch_available()
    -        else ()
    -    )
    -    pipeline_model_mapping = (
    -        {"image-feature-extraction": SwinModel, "image-classification": SwinForImageClassification}
    -        if is_torch_available()
    -        else {}
    -    )
    -    fx_compatible = True
    -
    -    test_pruning = False
    -    test_resize_embeddings = False
    -    test_head_masking = False
    -
    -    def setUp(self):
    -        self.model_tester = SwinModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=SwinConfig, embed_dim=37)
    -
    -    def test_config(self):
    -        self.create_and_test_config_common_properties()
    -        self.config_tester.create_and_test_config_to_json_string()
    -        self.config_tester.create_and_test_config_to_json_file()
    -        self.config_tester.create_and_test_config_from_and_save_pretrained()
    -        self.config_tester.create_and_test_config_with_num_labels()
    -        self.config_tester.check_config_can_be_init_without_params()
    -        self.config_tester.check_config_arguments_init()
    -
    -    def create_and_test_config_common_properties(self):
    -        return
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    # TODO: check if this works again for PyTorch 2.x.y
    -    @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
    -    def test_multi_gpu_data_parallel_forward(self):
    -        pass
    -
    -    def test_training_gradient_checkpointing(self):
    -        super().test_training_gradient_checkpointing()
    -
    -    def test_backbone(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_backbone(*config_and_inputs)
    -
    -    def test_for_masked_image_modeling(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
    -
    -    def test_for_image_classification(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
    -
    -    @unittest.skip(reason="Swin does not use inputs_embeds")
    -    def test_inputs_embeds(self):
    -        pass
    -
    -    @unittest.skip(reason="Swin Transformer does not use feedforward chunking")
    -    def test_feed_forward_chunking(self):
    -        pass
    -
    -    def test_model_common_attributes(self):
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
    -            x = model.get_output_embeddings()
    -            self.assertTrue(x is None or isinstance(x, nn.Linear))
    -
    -    def test_attention_outputs(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.return_dict = True
    -
    -        for model_class in self.all_model_classes:
    -            inputs_dict["output_attentions"] = True
    -            inputs_dict["output_hidden_states"] = False
    -            config.return_dict = True
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs.attentions
    -            expected_num_attentions = len(self.model_tester.depths)
    -            self.assertEqual(len(attentions), expected_num_attentions)
    -
    -            # check that output_attentions also work using config
    -            del inputs_dict["output_attentions"]
    -            config.output_attentions = True
    -            window_size_squared = config.window_size**2
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs.attentions
    -            self.assertEqual(len(attentions), expected_num_attentions)
    -
    -            self.assertListEqual(
    -                list(attentions[0].shape[-3:]),
    -                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
    -            )
    -            out_len = len(outputs)
    -
    -            # Check that attentions come last in the outputs and that their order is preserved
    -            inputs_dict["output_attentions"] = True
    -            inputs_dict["output_hidden_states"] = True
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            # also another +1 for reshaped_hidden_states
    -            added_hidden_states = 1 if model_class.__name__ == "SwinBackbone" else 2
    -            self.assertEqual(out_len + added_hidden_states, len(outputs))
    -
    -            self_attentions = outputs.attentions
    -
    -            self.assertEqual(len(self_attentions), expected_num_attentions)
    -
    -            self.assertListEqual(
    -                list(self_attentions[0].shape[-3:]),
    -                [self.model_tester.num_heads[0], window_size_squared, window_size_squared],
    -            )
    -
    -    def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
    -        model = model_class(config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        with torch.no_grad():
    -            outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -        hidden_states = outputs.hidden_states
    -
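    -        # hidden_states holds the initial patch embeddings plus one entry per stage, hence len(depths) + 1 by default.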
    -        expected_num_layers = getattr(
    -            self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
    -        )
    -        self.assertEqual(len(hidden_states), expected_num_layers)
    -
    -        # Swin has a different seq_length
    -        patch_size = (
    -            config.patch_size
    -            if isinstance(config.patch_size, collections.abc.Iterable)
    -            else (config.patch_size, config.patch_size)
    -        )
    -
    -        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
    -
    -        self.assertListEqual(
    -            list(hidden_states[0].shape[-2:]),
    -            [num_patches, self.model_tester.embed_dim],
    -        )
    -
    -        if not model_class.__name__ == "SwinBackbone":
    -            reshaped_hidden_states = outputs.reshaped_hidden_states
    -            self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
    -
    -            batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
    -            reshaped_hidden_states = (
    -                reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1)
    -            )
    -            self.assertListEqual(
    -                list(reshaped_hidden_states.shape[-2:]),
    -                [num_patches, self.model_tester.embed_dim],
    -            )
    -
    -    def test_hidden_states_output(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        image_size = (
    -            self.model_tester.image_size
    -            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
    -            else (self.model_tester.image_size, self.model_tester.image_size)
    -        )
    -
    -        for model_class in self.all_model_classes:
    -            inputs_dict["output_hidden_states"] = True
    -            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
    -
    -            # check that output_hidden_states also work using config
    -            del inputs_dict["output_hidden_states"]
    -            config.output_hidden_states = True
    -
    -            self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
    -
    -    def test_hidden_states_output_with_padding(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.patch_size = 3
    -
    -        image_size = (
    -            self.model_tester.image_size
    -            if isinstance(self.model_tester.image_size, collections.abc.Iterable)
    -            else (self.model_tester.image_size, self.model_tester.image_size)
    -        )
    -        patch_size = (
    -            config.patch_size
    -            if isinstance(config.patch_size, collections.abc.Iterable)
    -            else (config.patch_size, config.patch_size)
    -        )
    -
    -        padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0])
    -        padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1])
    -
    -        for model_class in self.all_model_classes:
    -            inputs_dict["output_hidden_states"] = True
    -            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))
    -
    -            # check that output_hidden_states also work using config
    -            del inputs_dict["output_hidden_states"]
    -            config.output_hidden_states = True
    -            self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width))
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model_name = "microsoft/swin-tiny-patch4-window7-224"
    -        model = SwinModel.from_pretrained(model_name)
    -        self.assertIsNotNone(model)
    -
    -    def test_initialization(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        configs_no_init = _config_zero_init(config)
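    -        # With the initializer scale zeroed out, non-embedding weights should be all zeros (or all ones for norm layers), so their means round to 0.0 or 1.0.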
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            for name, param in model.named_parameters():
    -                if "embeddings" not in name and param.requires_grad:
    -                    self.assertIn(
    -                        ((param.data.mean() * 1e9).round() / 1e9).item(),
    -                        [0.0, 1.0],
    -                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                    )
    -
    -
    -@require_vision
    -@require_torch
    -class SwinModelIntegrationTest(unittest.TestCase):
    -    @cached_property
    -    def default_image_processor(self):
    -        return (
    -            AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
    -            if is_vision_available()
    -            else None
    -        )
    -
    -    @slow
    -    def test_inference_image_classification_head(self):
    -        model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224").to(torch_device)
    -        image_processor = self.default_image_processor
    -
    -        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    -        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
    -
    -        # forward pass
    -        with torch.no_grad():
    -            outputs = model(**inputs)
    -
    -        # verify the logits
    -        expected_shape = torch.Size((1, 1000))
    -        self.assertEqual(outputs.logits.shape, expected_shape)
    -        expected_slice = torch.tensor([-0.0948, -0.6454, -0.0921]).to(torch_device)
    -        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
    -
    -
    -@require_torch
    -class SwinBackboneTest(unittest.TestCase):
    -    all_model_classes = (SwinBackbone,) if is_torch_available() else ()
    -    config_class = SwinConfig
    -
    -    def setUp(self):
    -        self.model_tester = SwinModelTester(self)
    diff --git a/tests/transformers/tests/models/t5/__init__.py b/tests/transformers/tests/models/t5/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/t5/test_modeling_t5.py b/tests/transformers/tests/models/t5/test_modeling_t5.py
    deleted file mode 100644
    index a24cd608de..0000000000
    --- a/tests/transformers/tests/models/t5/test_modeling_t5.py
    +++ /dev/null
    @@ -1,1639 +0,0 @@
    -# coding=utf-8
    -# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -
    -import copy
    -import os
    -import pickle
    -import tempfile
    -import unittest
    -
    -from transformers import T5Config, is_torch_available
    -from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
    -from transformers.testing_utils import (
    -    require_accelerate,
    -    require_sentencepiece,
    -    require_tokenizers,
    -    require_torch,
    -    slow,
    -    torch_device,
    -)
    -from transformers.utils import cached_property, is_torch_fx_available
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...generation.test_utils import GenerationTesterMixin
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
    -
    -
    -if is_torch_fx_available():
    -    from transformers.utils.fx import symbolic_trace
    -
    -
    -if is_torch_available():
    -    import torch
    -
    -    from transformers import (
    -        AutoTokenizer,
    -        ByT5Tokenizer,
    -        T5EncoderModel,
    -        T5ForConditionalGeneration,
    -        T5ForQuestionAnswering,
    -        T5ForSequenceClassification,
    -        T5ForTokenClassification,
    -        T5Model,
    -        T5Tokenizer,
    -    )
    -    from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST
    -
    -
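    -# Run this test suite on Habana Gaudi (HPU) with the Gaudi-adapted Transformers model implementations.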
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -
    -class T5ModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        vocab_size=99,
    -        batch_size=13,
    -        encoder_seq_length=7,
    -        decoder_seq_length=7,
    -        # For common tests
    -        is_training=True,
    -        use_attention_mask=True,
    -        use_labels=True,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        d_ff=37,
    -        relative_attention_num_buckets=8,
    -        dropout_rate=0.1,
    -        initializer_factor=0.002,
    -        eos_token_id=1,
    -        pad_token_id=0,
    -        decoder_start_token_id=0,
    -        scope=None,
    -        decoder_layers=None,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.encoder_seq_length = encoder_seq_length
    -        self.decoder_seq_length = decoder_seq_length
    -        # For common tests
    -        self.seq_length = self.decoder_seq_length
    -        self.is_training = is_training
    -        self.use_attention_mask = use_attention_mask
    -        self.use_labels = use_labels
    -        self.vocab_size = vocab_size
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.d_ff = d_ff
    -        self.relative_attention_num_buckets = relative_attention_num_buckets
    -        self.dropout_rate = dropout_rate
    -        self.initializer_factor = initializer_factor
    -        self.eos_token_id = eos_token_id
    -        self.pad_token_id = pad_token_id
    -        self.decoder_start_token_id = decoder_start_token_id
    -        self.scope = None
    -        self.decoder_layers = decoder_layers
    -
    -    def get_large_model_config(self):
    -        return T5Config.from_pretrained("google-t5/t5-base")
    -
    -    def prepare_config_and_inputs(self):
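    -        # Clamp sampled ids to a minimum of 2 so they cannot collide with the pad (0) and eos (1) special tokens.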
    -        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2)
    -        input_ids[:, -1] = self.eos_token_id  # Eos Token
    -        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
    -
    -        attention_mask = None
    -        decoder_attention_mask = None
    -        if self.use_attention_mask:
    -            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
    -            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
    -
    -        lm_labels = None
    -        if self.use_labels:
    -            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
    -
    -        config = self.get_config()
    -
    -        return (
    -            config,
    -            input_ids,
    -            decoder_input_ids,
    -            attention_mask,
    -            decoder_attention_mask,
    -            lm_labels,
    -        )
    -
    -    def get_pipeline_config(self):
    -        return T5Config(
    -            vocab_size=166,  # t5 forces 100 extra tokens
    -            d_model=self.hidden_size,
    -            d_ff=self.d_ff,
    -            d_kv=self.hidden_size // self.num_attention_heads,
    -            num_layers=self.num_hidden_layers,
    -            num_decoder_layers=self.decoder_layers,
    -            num_heads=self.num_attention_heads,
    -            relative_attention_num_buckets=self.relative_attention_num_buckets,
    -            dropout_rate=self.dropout_rate,
    -            initializer_factor=self.initializer_factor,
    -            eos_token_id=self.eos_token_id,
    -            bos_token_id=self.pad_token_id,
    -            pad_token_id=self.pad_token_id,
    -            decoder_start_token_id=self.decoder_start_token_id,
    -        )
    -
    -    def get_config(self):
    -        return T5Config(
    -            vocab_size=self.vocab_size,
    -            d_model=self.hidden_size,
    -            d_ff=self.d_ff,
    -            d_kv=self.hidden_size // self.num_attention_heads,
    -            num_layers=self.num_hidden_layers,
    -            num_decoder_layers=self.decoder_layers,
    -            num_heads=self.num_attention_heads,
    -            relative_attention_num_buckets=self.relative_attention_num_buckets,
    -            dropout_rate=self.dropout_rate,
    -            initializer_factor=self.initializer_factor,
    -            eos_token_id=self.eos_token_id,
    -            bos_token_id=self.pad_token_id,
    -            pad_token_id=self.pad_token_id,
    -            decoder_start_token_id=self.decoder_start_token_id,
    -        )
    -
    -    def check_prepare_lm_labels_via_shift_left(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        # make sure that lm_labels are correctly padded from the right
    -        lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
    -
    -        # add causal pad token mask
    -        triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
    -        lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
    -        decoder_input_ids = model._shift_right(lm_labels)
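    -        # _shift_right prepends decoder_start_token_id and drops the last label, so decoder inputs are the labels shifted one position to the right.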
    -
    -        for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
    -            # first item
    -            self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
    -            if i < decoder_input_ids_slice.shape[-1]:
    -                if i < decoder_input_ids.shape[-1] - 1:
    -                    # items before diagonal
    -                    self.parent.assertListEqual(
    -                        decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
    -                    )
    -                # pad items after diagonal
    -                if i < decoder_input_ids.shape[-1] - 2:
    -                    self.parent.assertListEqual(
    -                        decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
    -                    )
    -            else:
    -                # rows beyond the square part of the batch: decoder inputs are the labels shifted right by one
    -                self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
    -
    -    def create_and_check_model(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids=input_ids,
    -            decoder_input_ids=decoder_input_ids,
    -            attention_mask=attention_mask,
    -            decoder_attention_mask=decoder_attention_mask,
    -        )
    -        result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
    -        decoder_output = result.last_hidden_state
    -        decoder_past = result.past_key_values
    -        encoder_output = result.encoder_last_hidden_state
    -
    -        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
    -        self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
    -        # There should be `num_layers` key value embeddings stored in decoder_past
    -        self.parent.assertEqual(len(decoder_past), config.num_layers)
    -        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
    -        self.parent.assertEqual(len(decoder_past[0]), 4)
    -
    -    def create_and_check_with_lm_head(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    -        outputs = model(
    -            input_ids=input_ids,
    -            decoder_input_ids=decoder_input_ids,
    -            decoder_attention_mask=decoder_attention_mask,
    -            labels=lm_labels,
    -        )
    -        self.parent.assertEqual(len(outputs), 4)
    -        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
    -        self.parent.assertEqual(outputs["loss"].size(), ())
    -
    -    def create_and_check_with_sequence_classification_head(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        labels = torch.tensor([1] * self.batch_size, dtype=torch.long, device=torch_device)
    -        model = T5ForSequenceClassification(config=config).to(torch_device).eval()
    -        outputs = model(
    -            input_ids=input_ids,
    -            decoder_input_ids=input_ids,
    -            labels=labels,
    -        )
    -        # self.parent.assertEqual(len(outputs), 4)
    -        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, config.num_labels))
    -        self.parent.assertEqual(outputs["loss"].size(), ())
    -
    -    def create_and_check_decoder_model_past(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config).get_decoder().to(torch_device).eval()
    -        # first forward pass
    -        outputs = model(input_ids, use_cache=True)
    -        outputs_use_cache_conf = model(input_ids)
    -        outputs_no_past = model(input_ids, use_cache=False)
    -
    -        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
    -        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
    -
    -        output, past_key_values = outputs.to_tuple()
    -
    -        # create a hypothetical next token and extend next_input_ids with it
    -        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
    -
    -        # append the next token to input_ids
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -
    -        output_from_no_past = model(next_input_ids)["last_hidden_state"]
    -        output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    def create_and_check_decoder_model_attention_mask_past(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config).get_decoder()
    -        model.to(torch_device)
    -        model.eval()
    -
    -        # create attention mask
    -        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
    -
    -        half_seq_length = input_ids.shape[-1] // 2
    -        attn_mask[:, half_seq_length:] = 0
    -
    -        # first forward pass
    -        output, past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple()
    -
    -        # create a hypothetical next token and extend next_input_ids with it
    -        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
    -
    -        # change a random masked slice from input_ids
    -        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
    -        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
    -        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
    -
    -        # append to next input_ids and attn_mask
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -        attn_mask = torch.cat(
    -            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
    -            dim=1,
    -        )
    -
    -        # get two different outputs
    -        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
    -        output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[
    -            "last_hidden_state"
    -        ]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    def create_and_check_decoder_model_past_large_inputs(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config).get_decoder().to(torch_device).eval()
    -        # first forward pass
    -        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
    -
    -        output, past_key_values = outputs.to_tuple()
    -
    -        # create hypothetical multiple next tokens and extend next_input_ids with them
    -        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    -        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
    -
    -        # append the next tokens to input_ids and the next mask to attention_mask
    -        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    -        next_attention_mask = torch.cat([attention_mask, next_mask], dim=-1)
    -
    -        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
    -        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
    -            "last_hidden_state"
    -        ]
    -
    -        # select random slice
    -        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    -        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    -        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
    -
    -        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
    -
    -        # test that outputs are equal for slice
    -        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
    -
    -    def create_and_check_generate_with_past_key_values(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
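    -        # With the same sampling seed, generation with and without the KV cache should produce identical sequences.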
    -        torch.manual_seed(0)
    -        output_without_past_cache = model.generate(
    -            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
    -        )
    -        torch.manual_seed(0)
    -        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
    -        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
    -
    -    def create_and_check_model_fp16_forward(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        model = T5Model(config=config).to(torch_device).half().eval()
    -        output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
    -        self.parent.assertFalse(torch.isnan(output).any().item())
    -
    -    def create_and_check_encoder_decoder_shared_weights(
    -        self,
    -        config,
    -        input_ids,
    -        decoder_input_ids,
    -        attention_mask,
    -        decoder_attention_mask,
    -        lm_labels,
    -    ):
    -        for model_class in [T5Model, T5ForConditionalGeneration]:
    -            torch.manual_seed(0)
    -            model = model_class(config=config).to(torch_device).eval()
    -            # loading the state dict copies the weights but does not tie them
    -            model.encoder.load_state_dict(model.decoder.state_dict(), strict=False)
    -
    -            torch.manual_seed(0)
    -            tied_config = copy.deepcopy(config)
    -            tied_config.tie_encoder_decoder = True
    -            tied_model = model_class(config=tied_config).to(torch_device).eval()
    -
    -            model_result = model(
    -                input_ids=input_ids,
    -                decoder_input_ids=decoder_input_ids,
    -                attention_mask=attention_mask,
    -                decoder_attention_mask=decoder_attention_mask,
    -            )
    -
    -            tied_model_result = tied_model(
    -                input_ids=input_ids,
    -                decoder_input_ids=decoder_input_ids,
    -                attention_mask=attention_mask,
    -                decoder_attention_mask=decoder_attention_mask,
    -            )
    -
    -            # check that the tied model has fewer parameters
    -            self.parent.assertLess(
    -                sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
    -            )
    -            random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
    -
    -            # check that outputs are equal
    -            self.parent.assertTrue(
    -                torch.allclose(
    -                    model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
    -                )
    -            )
    -
    -            # check that outputs after saving and loading are equal
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                tied_model.save_pretrained(tmpdirname)
    -                tied_model = model_class.from_pretrained(tmpdirname)
    -                tied_model.to(torch_device)
    -                tied_model.eval()
    -
    -                # check that the tied model has fewer parameters
    -                self.parent.assertLess(
    -                    sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
    -                )
    -                random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
    -
    -                tied_model_result = tied_model(
    -                    input_ids=input_ids,
    -                    decoder_input_ids=decoder_input_ids,
    -                    attention_mask=attention_mask,
    -                    decoder_attention_mask=decoder_attention_mask,
    -                )
    -
    -                # check that outputs are equal
    -                self.parent.assertTrue(
    -                    torch.allclose(
    -                        model_result[0][0, :, random_slice_idx],
    -                        tied_model_result[0][0, :, random_slice_idx],
    -                        atol=1e-4,
    -                    )
    -                )
    -
    -    def check_resize_embeddings_t5_v1_1(
    -        self,
    -        config,
    -    ):
    -        prev_vocab_size = config.vocab_size
    -
    -        config.tie_word_embeddings = False
    -        model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    -        model.resize_token_embeddings(prev_vocab_size - 10)
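    -        # With untied embeddings, resizing must shrink the input embeddings, the LM head, and config.vocab_size consistently.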
    -
    -        self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10)
    -        self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10)
    -        self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10)
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            input_ids,
    -            decoder_input_ids,
    -            attention_mask,
    -            decoder_attention_mask,
    -            lm_labels,
    -        ) = config_and_inputs
    -
    -        inputs_dict = {
    -            "input_ids": input_ids,
    -            "attention_mask": attention_mask,
    -            "decoder_input_ids": decoder_input_ids,
    -            "decoder_attention_mask": decoder_attention_mask,
    -            "use_cache": False,
    -        }
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (T5Model, T5ForConditionalGeneration, T5ForSequenceClassification, T5ForQuestionAnswering)
    -        if is_torch_available()
    -        else ()
    -    )
    -    all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else ()
    -    pipeline_model_mapping = (
    -        {
    -            "conversational": T5ForConditionalGeneration,
    -            "feature-extraction": T5Model,
    -            "question-answering": T5ForQuestionAnswering,
    -            "summarization": T5ForConditionalGeneration,
    -            "text-classification": T5ForSequenceClassification,
    -            "text2text-generation": T5ForConditionalGeneration,
    -            "translation": T5ForConditionalGeneration,
    -            "zero-shot": T5ForSequenceClassification,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else ()
    -    fx_compatible = True
    -    test_pruning = False
    -    test_resize_embeddings = True
    -    test_model_parallel = True
    -    is_encoder_decoder = True
    -    # The small T5 model needs higher percentages for CPU/MP tests
    -    model_split_percents = [0.8, 0.9]
    -
    -    def setUp(self):
    -        self.model_tester = T5ModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
    -
    -    # `QAPipelineTests` does not work well with slow tokenizers (for some models), and we don't want to touch the file
    -    # `src/transformers/data/processors/squad.py` (where this test fails for this model)
    -    def is_pipeline_test_to_skip(
    -        self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name
    -    ):
    -        if tokenizer_name is None:
    -            return True
    -        if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
    -            return True
    -
    -        return False
    -
    -    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
    -        if not is_torch_fx_available() or not self.fx_compatible:
    -            return
    -
    -        configs_no_init = _config_zero_init(config)  # To be sure we have no NaNs
    -        configs_no_init.return_dict = False
    -
    -        for model_class in self.all_model_classes:
    -            if model_class.__name__ == "T5ForSequenceClassification":
    -                continue
    -            model = model_class(config=configs_no_init)
    -            model.to(torch_device)
    -            model.eval()
    -            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
    -
    -            try:
    -                if model.config.is_encoder_decoder:
    -                    model.config.use_cache = False  # FSMT still requires this hack -> FSMT should probably be refactored similarly to BART afterward
    -                    labels = inputs.get("labels", None)
    -                    input_names = [
    -                        "attention_mask",
    -                        "decoder_attention_mask",
    -                        "decoder_input_ids",
    -                        "input_features",
    -                        "input_ids",
    -                        "input_values",
    -                    ]
    -                    if labels is not None:
    -                        input_names.append("labels")
    -
    -                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
    -                    input_names = list(filtered_inputs.keys())
    -
    -                    model_output = model(**filtered_inputs)
    -
    -                    traced_model = symbolic_trace(model, input_names)
    -                    traced_output = traced_model(**filtered_inputs)
    -                else:
    -                    input_names = [
    -                        "attention_mask",
    -                        "bbox",
    -                        "input_features",
    -                        "input_ids",
    -                        "input_values",
    -                        "pixel_values",
    -                        "token_type_ids",
    -                        "visual_feats",
    -                        "visual_pos",
    -                    ]
    -
    -                    labels = inputs.get("labels", None)
    -                    start_positions = inputs.get("start_positions", None)
    -                    end_positions = inputs.get("end_positions", None)
    -                    if labels is not None:
    -                        input_names.append("labels")
    -                    if start_positions is not None:
    -                        input_names.append("start_positions")
    -                    if end_positions is not None:
    -                        input_names.append("end_positions")
    -
    -                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
    -                    input_names = list(filtered_inputs.keys())
    -
    -                    if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and (
    -                        not hasattr(model.config, "problem_type") or model.config.problem_type is None
    -                    ):
    -                        model.config.problem_type = "single_label_classification"
    -
    -                    traced_model = symbolic_trace(model, input_names)
    -                    traced_output = traced_model(**filtered_inputs)
    -                    model_output = model(**filtered_inputs)
    -
    -            except Exception as e:
    -                self.fail(f"Couldn't trace module: {e}")
    -
    -            def flatten_output(output):
    -                flatten = []
    -                for x in output:
    -                    if isinstance(x, (tuple, list)):
    -                        flatten += flatten_output(x)
    -                    elif not isinstance(x, torch.Tensor):
    -                        continue
    -                    else:
    -                        flatten.append(x)
    -                return flatten
    -
    -            model_output = flatten_output(model_output)
    -            traced_output = flatten_output(traced_output)
    -            num_outputs = len(model_output)
    -
    -            for i in range(num_outputs):
    -                self.assertTrue(
    -                    torch.allclose(model_output[i], traced_output[i]),
    -                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
    -                )
    -
    -            # Test that the model can be serialized and restored properly
    -            with tempfile.TemporaryDirectory() as tmp_dir_name:
    -                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
    -                try:
    -                    with open(pkl_file_name, "wb") as f:
    -                        pickle.dump(traced_model, f)
    -                    with open(pkl_file_name, "rb") as f:
    -                        loaded = pickle.load(f)
    -                except Exception as e:
    -                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
    -
    -                loaded_output = loaded(**filtered_inputs)
    -                loaded_output = flatten_output(loaded_output)
    -
    -                for i in range(num_outputs):
    -                    self.assertTrue(
    -                        torch.allclose(model_output[i], loaded_output[i]),
    -                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
    -                    )
    -
    -            # Avoid a memory leak. Without this, each call increases RAM usage by ~20MB.
    -            # (Even with this call, there is still a memory leak of ~0.04MB.)
    -            self.clear_torch_jit_class_registry()
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_shift_right(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs)
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_v1_1(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        # check that gated GELU feed-forward and untied word embeddings work
    -        config = config_and_inputs[0]
    -        config.tie_word_embeddings = False
    -        config.feed_forward_proj = "gated-gelu"
    -        self.model_tester.create_and_check_model(config, *config_and_inputs[1:])
    -
    -    # T5ForSequenceClassification does not support inputs_embeds
    -    def test_inputs_embeds(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in (T5Model, T5ForConditionalGeneration, T5ForQuestionAnswering):
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
    -
    -            if not self.is_encoder_decoder:
    -                input_ids = inputs["input_ids"]
    -                del inputs["input_ids"]
    -            else:
    -                encoder_input_ids = inputs["input_ids"]
    -                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
    -                del inputs["input_ids"]
    -                inputs.pop("decoder_input_ids", None)
    -
    -            wte = model.get_input_embeddings()
    -            if not self.is_encoder_decoder:
    -                inputs["inputs_embeds"] = wte(input_ids)
    -            else:
    -                inputs["inputs_embeds"] = wte(encoder_input_ids)
    -                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
    -
    -            with torch.no_grad():
    -                model(**inputs)[0]
    -
    -    def test_config_and_model_silu_gated(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        config = config_and_inputs[0]
    -        config.feed_forward_proj = "gated-silu"
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_with_lm_head(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_with_lm_head(*config_and_inputs)
    -
    -    def test_with_sequence_classification_head(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs)
    -
    -    def test_decoder_model_past(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
    -
    -    def test_decoder_model_past_with_attn_mask(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
    -
    -    def test_decoder_model_past_with_3d_attn_mask(self):
    -        (
    -            config,
    -            input_ids,
    -            decoder_input_ids,
    -            attention_mask,
    -            decoder_attention_mask,
    -            lm_labels,
    -        ) = self.model_tester.prepare_config_and_inputs()
    -
    -        attention_mask = ids_tensor(
    -            [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length],
    -            vocab_size=2,
    -        )
    -        decoder_attention_mask = ids_tensor(
    -            [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length],
    -            vocab_size=2,
    -        )
    -
    -        self.model_tester.create_and_check_decoder_model_attention_mask_past(
    -            config,
    -            input_ids,
    -            decoder_input_ids,
    -            attention_mask,
    -            decoder_attention_mask,
    -            lm_labels,
    -        )
    -
    -    def test_decoder_model_past_with_large_inputs(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
    -
    -    @unittest.skip("Not supported on Gaudi.")
    -    def test_generate_with_past_key_values(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs)
    -
    -    def test_encoder_decoder_shared_weights(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs)
    -
    -    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
    -    def test_model_fp16_forward(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
    -
    -    def test_v1_1_resize_embeddings(self):
    -        config = self.model_tester.prepare_config_and_inputs()[0]
    -        self.model_tester.check_resize_embeddings_t5_v1_1(config)
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model_name = "google-t5/t5-small"
    -        model = T5Model.from_pretrained(model_name)
    -        self.assertIsNotNone(model)
    -
    -    @unittest.skip("Test has a segmentation fault on torch 1.8.0")
    -    def test_export_to_onnx(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        model = T5Model(config_and_inputs[0]).to(torch_device)
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            torch.onnx.export(
    -                model,
    -                (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]),
    -                f"{tmpdirname}/t5_test.onnx",
    -                export_params=True,
    -                opset_version=9,
    -                input_names=["input_ids", "decoder_input_ids"],
    -            )
    -
    -    def test_generate_with_head_masking(self):
    -        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        config = config_and_inputs[0]
    -        max_length = config_and_inputs[1].shape[-1] + 3
    -        model = T5ForConditionalGeneration(config).eval()
    -        model.to(torch_device)
    -
    -        head_masking = {
    -            "head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
    -            "decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
    -            "cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
    -        }
    -
    -        for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
    -            head_masks = {name: mask}
    -            # Explicitly pass decoder_head_mask as it is required by the T5 model when head_mask is specified
    -            if name == "head_mask":
    -                head_masks["decoder_head_mask"] = torch.ones(
    -                    config.num_decoder_layers, config.num_heads, device=torch_device
    -                )
    -
    -            out = model.generate(
    -                config_and_inputs[1],
    -                num_beams=1,
    -                max_length=max_length,
    -                output_attentions=True,
    -                return_dict_in_generate=True,
    -                **head_masks,
    -            )
    -            # We check the state of decoder_attentions and cross_attentions just from the last step
    -            attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
    -            self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
    -
    -    @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.")
    -    def test_disk_offload(self):
    -        pass
    -
    -    @unittest.skip("Does not support conversations.")
    -    def test_pipeline_conversational(self):
    -        pass
    -
    -
    -class T5EncoderOnlyModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        vocab_size=99,
    -        batch_size=13,
    -        encoder_seq_length=7,
    -        # For common tests
    -        use_attention_mask=True,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        d_ff=37,
    -        relative_attention_num_buckets=8,
    -        is_training=False,
    -        dropout_rate=0.1,
    -        initializer_factor=0.002,
    -        is_encoder_decoder=False,
    -        eos_token_id=1,
    -        pad_token_id=0,
    -        scope=None,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.encoder_seq_length = encoder_seq_length
    -        # For common tests
    -        self.seq_length = self.encoder_seq_length
    -        self.use_attention_mask = use_attention_mask
    -        self.vocab_size = vocab_size
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.d_ff = d_ff
    -        self.relative_attention_num_buckets = relative_attention_num_buckets
    -        self.dropout_rate = dropout_rate
    -        self.initializer_factor = initializer_factor
    -        self.eos_token_id = eos_token_id
    -        self.pad_token_id = pad_token_id
    -        self.is_encoder_decoder = is_encoder_decoder
    -        self.scope = None
    -        self.is_training = is_training
    -
    -    def get_large_model_config(self):
    -        return T5Config.from_pretrained("google-t5/t5-base")
    -
    -    def prepare_config_and_inputs(self):
    -        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
    -
    -        attention_mask = None
    -        if self.use_attention_mask:
    -            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
    -
    -        config = T5Config(
    -            vocab_size=self.vocab_size,
    -            d_model=self.hidden_size,
    -            d_ff=self.d_ff,
    -            d_kv=self.hidden_size // self.num_attention_heads,
    -            num_layers=self.num_hidden_layers,
    -            num_heads=self.num_attention_heads,
    -            relative_attention_num_buckets=self.relative_attention_num_buckets,
    -            dropout_rate=self.dropout_rate,
    -            initializer_factor=self.initializer_factor,
    -            eos_token_id=self.eos_token_id,
    -            bos_token_id=self.pad_token_id,
    -            pad_token_id=self.pad_token_id,
    -            is_encoder_decoder=self.is_encoder_decoder,
    -        )
    -
    -        return (
    -            config,
    -            input_ids,
    -            attention_mask,
    -        )
    -
    -    def create_and_check_model(
    -        self,
    -        config,
    -        input_ids,
    -        attention_mask,
    -    ):
    -        model = T5EncoderModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(
    -            input_ids=input_ids,
    -            attention_mask=attention_mask,
    -        )
    -        result = model(input_ids=input_ids)
    -        encoder_output = result.last_hidden_state
    -
    -        self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
    -
    -    def create_and_check_model_fp16_forward(
    -        self,
    -        config,
    -        input_ids,
    -        attention_mask,
    -    ):
    -        model = T5EncoderModel(config=config).to(torch_device).half().eval()
    -        output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"]
    -        self.parent.assertFalse(torch.isnan(output).any().item())
    -
    -    def create_and_check_with_token_classification_head(
    -        self,
    -        config,
    -        input_ids,
    -        attention_mask,
    -    ):
    -        labels = torch.tensor([1] * self.seq_length * self.batch_size, dtype=torch.long, device=torch_device)
    -        model = T5ForTokenClassification(config=config).to(torch_device).eval()
    -        outputs = model(
    -            input_ids=input_ids,
    -            labels=labels,
    -            attention_mask=attention_mask,
    -        )
    -        self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.seq_length, config.num_labels))
    -        self.parent.assertEqual(outputs["loss"].size(), ())
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            input_ids,
    -            attention_mask,
    -        ) = config_and_inputs
    -
    -        inputs_dict = {
    -            "input_ids": input_ids,
    -            "attention_mask": attention_mask,
    -        }
    -        return config, inputs_dict
    -
    -
    -class T5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
    -    all_model_classes = (T5EncoderModel, T5ForTokenClassification) if is_torch_available() else ()
    -    test_pruning = False
    -    test_resize_embeddings = False
    -    test_model_parallel = True
    -    pipeline_model_mapping = (
    -        {
    -            "token-classification": T5ForTokenClassification,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    all_parallelizable_model_classes = (T5EncoderModel,) if is_torch_available() else ()
    -
    -    def setUp(self):
    -        self.model_tester = T5EncoderOnlyModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
    -    def test_model_fp16_forward(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
    -
    -    def test_with_token_classification_head(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs)
    -
    -
    -def use_task_specific_params(model, task):
    -    model.config.update(model.config.task_specific_params[task])
    -
    -
    -@require_torch
    -@require_accelerate
    -@require_tokenizers
    -@slow
    -class T5ModelFp16Tests(unittest.TestCase):
    -    def test_fp16_fp32_conversion(self):
    -        r"""
    -        A test to check whether the argument `keep_in_fp32_modules` correctly does its job
    -        """
    -        orig_import = __import__
    -        accelerate_mock = unittest.mock.Mock()
    -
    -        # mock import of accelerate
    -        def import_accelerate_mock(name, *args, **kwargs):
    -            if name == "accelerate":
    -                if accelerate_available:
    -                    return accelerate_mock
    -                else:
    -                    raise ImportError
    -            return orig_import(name, *args, **kwargs)
    -
    -        # Load without using `accelerate`
    -        with unittest.mock.patch("builtins.__import__", side_effect=import_accelerate_mock):
    -            accelerate_available = False
    -
    -            model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", torch_dtype=torch.float16)
    -            self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
    -            self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
    -
    -            # Load in bf16 without using `accelerate`
    -            model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", torch_dtype=torch.bfloat16)
    -            self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
    -            self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
    -
    -        # Load using `accelerate` in bf16
    -        model = T5ForConditionalGeneration.from_pretrained(
    -            "google-t5/t5-small", torch_dtype=torch.bfloat16, device_map="auto"
    -        )
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
    -
    -        # Load using `accelerate` in bf16 with low_cpu_mem_usage
    -        model = T5ForConditionalGeneration.from_pretrained(
    -            "google-t5/t5-small", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
    -        )
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.bfloat16)
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.bfloat16)
    -
    -        # Load without using `accelerate`
    -        model = T5ForConditionalGeneration.from_pretrained(
    -            "google-t5/t5-small", torch_dtype=torch.float16, low_cpu_mem_usage=True
    -        )
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
    -
    -        # Load using `accelerate`
    -        model = T5ForConditionalGeneration.from_pretrained(
    -            "google-t5/t5-small", torch_dtype=torch.float16, device_map="auto"
    -        )
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
    -        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == torch.float16)
    -
    -
    -@require_torch
    -@require_sentencepiece
    -@require_tokenizers
    -class T5ModelIntegrationTests(unittest.TestCase):
    -    @cached_property
    -    def model(self):
    -        return T5ForConditionalGeneration.from_pretrained("google-t5/t5-base").to(torch_device)
    -
    -    @cached_property
    -    def tokenizer(self):
    -        return T5Tokenizer.from_pretrained("google-t5/t5-base")
    -
    -    @slow
    -    def test_torch_quant(self):
    -        r"""
    -        Test that a simple `torch.quantization.quantize_dynamic` call works on a T5 model.
    -        """
    -        model_name = "google/flan-t5-small"
    -        tokenizer = T5Tokenizer.from_pretrained(model_name)
    -        model = T5ForConditionalGeneration.from_pretrained(model_name)
    -        model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    -        input_text = "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?"
    -        input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    -        _ = model.generate(input_ids)
    -
    -    @slow
    -    def test_small_generation(self):
    -        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(torch_device)
    -        model.config.max_length = 8
    -        model.config.num_beams = 1
    -        model.config.do_sample = False
    -        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
    -
    -        input_ids = tokenizer("summarize: Hello there", return_tensors="pt").input_ids.to(torch_device)
    -
    -        sequences = model.generate(input_ids)
    -
    -        output_str = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]
    -        self.assertTrue(output_str == "Hello there!")
    -
    -    @slow
    -    def test_small_integration_test(self):
    -        """
    -        For comparison, run:
    -        >>> import t5  # pip install t5==0.7.1
    -        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
    -
    -        >>> path_to_mtf_small_t5_checkpoint = ''
    -        >>> path_to_mtf_small_spm_model_path = ''
    -        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None)
    -        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
    -        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
    -        """
    -
    -        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small").to(torch_device)
    -        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
    -
    -        input_ids = tokenizer("Hello there", return_tensors="pt").input_ids
    -        labels = tokenizer("Hi I am", return_tensors="pt").input_ids
    -
    -        loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss
    -        mtf_score = -(labels.shape[-1] * loss.item())
    -
    -        EXPECTED_SCORE = -19.0845
    -        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
    -
    -    @slow
    -    def test_small_v1_1_integration_test(self):
    -        """
    -        For comparison, run:
    -        >>> import t5  # pip install t5==0.7.1
    -        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
    -
    -        >>> path_to_mtf_small_t5_v1_1_checkpoint = ''
    -        >>> path_to_mtf_small_spm_model_path = ''
    -        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1_1_checkpoint, batch_size=1, tpu=None)
    -        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
    -        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
    -        """
    -
    -        model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small").to(torch_device)
    -        tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small")
    -
    -        input_ids = tokenizer("Hello there", return_tensors="pt").input_ids
    -        labels = tokenizer("Hi I am", return_tensors="pt").input_ids
    -
    -        loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss
    -        mtf_score = -(labels.shape[-1] * loss.item())
    -
    -        EXPECTED_SCORE = -59.0293
    -        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
    -
    -    @slow
    -    def test_small_byt5_integration_test(self):
    -        """
    -        For comparison, run:
    -        >>> import t5  # pip install t5==0.9.1
    -
    -        >>> path_to_byt5_small_checkpoint = ''
    -        >>> t5_model = t5.models.MtfModel(model_dir=path_to_byt5_small_checkpoint, batch_size=1, tpu=None)
    -        >>> vocab = t5.data.ByteVocabulary()
    -        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
    -        """
    -
    -        model = T5ForConditionalGeneration.from_pretrained("google/byt5-small").to(torch_device)
    -        tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
    -
    -        input_ids = tokenizer("Hello there", return_tensors="pt").input_ids
    -        labels = tokenizer("Hi I am", return_tensors="pt").input_ids
    -
    -        loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss
    -        mtf_score = -(labels.shape[-1] * loss.item())
    -
    -        EXPECTED_SCORE = -60.7397
    -        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
    -
    -    @slow
    -    def test_summarization(self):
    -        model = self.model
    -        tok = self.tokenizer
    -
    -        FRANCE_ARTICLE = (  # noqa
    -            "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings"
    -            " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane."
    -            ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."'
    -            ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s'
    -            " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video"
    -            " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French"
    -            " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a"
    -            " phone at the wreckage site. The two publications described the supposed video, but did not post it on"
    -            " their websites. The publications said that they watched the video, which was found by a source close to"
    -            " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported."
    -            ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the'
    -            " cockpit door with a heavy object.  Towards the end, after a heavy shake, stronger than the others, the"
    -            ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,'
    -            " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said"
    -            " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman"
    -            " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the"
    -            ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,'
    -            ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be'
    -            " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by"
    -            " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so"
    -            " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could"
    -            ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin'
    -            ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match'
    -            ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered'
    -            ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something'
    -            " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the"
    -            ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline'
    -            " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the"
    -            " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the"
    -            ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of'
    -            ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school'
    -            " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in"
    -            " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent"
    -            " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and"
    -            " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%"
    -            ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was'
    -            " sharing the information and documents -- including training and medical records -- with public"
    -            " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the"
    -            " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the"
    -            " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash"
    -            " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late"
    -            " Tuesday that no visible human remains were left at the site but recovery teams would keep searching."
    -            " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all"
    -            " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested."
    -            " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said."
    -            " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew"
    -            " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with"
    -            " the flight school during his training were among several developments as investigators continued to"
    -            " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa"
    -            " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his"
    -            ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in'
    -            " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at"
    -            " some point before his aviation career and underwent psychotherapy before he got his pilot's license."
    -            " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the"
    -            " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to"
    -            " lose his pilot's license, a European government official briefed on the investigation told CNN on"
    -            ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being'
    -            " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that"
    -            " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would"
    -            " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had"
    -            " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded"
    -            " he had psychological issues, the European government official said. But no matter what details emerge"
    -            " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic"
    -            ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact'
    -            " that maybe they weren't going to keep doing their job and they're upset about that and so they're"
    -            ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to'
    -            " also take that rage and turn it outward on 149 other people who had nothing to do with the person's"
    -            ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight'
    -            " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura"
    -            " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine"
    -            " Amiel and Anna-Maja Rappard contributed to this report."
    -        )
    -        SHORTER_ARTICLE = (
    -            "(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on"
    -            " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The"
    -            " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based."
    -            " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its"
    -            ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East'
    -            ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the'
    -            " situation in Palestinian territories, paving the way for possible war crimes investigations against"
    -            " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and"
    -            " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the"
    -            " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a"
    -            ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the'
    -            ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an'
    -            ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge'
    -            " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the"
    -            ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine'
    -            " acquires all the rights as well as responsibilities that come with being a State Party to the Statute."
    -            ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights'
    -            ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should'
    -            " immediately end their pressure, and countries that support universal acceptance of the court's treaty"
    -            ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the'
    -            " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's"
    -            ' decision to join a treaty to which over 100 countries around the world are members." In January, when'
    -            " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an"
    -            ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"'
    -            " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a"
    -            ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in'
    -            ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We'
    -            ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"'
    -            " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the"
    -            ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the'
    -            " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou"
    -            ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war'
    -            " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry"
    -            " will include alleged war crimes committed since June. The International Criminal Court was set up in"
    -            " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder"
    -            " and Faith Karimi contributed to this report."
    -        )
    -        IRAN_ARTICLE = (
    -            "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran"
    -            " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively"
    -            " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger."
    -            " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli"
    -            " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a"
    -            " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since"
    -            " the announcement of the new framework will likely result in more heat than light. It will not be helped"
    -            " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ."
    -            " The most misleading assertion, despite universal rejection by experts, is that the negotiations'"
    -            " objective at the outset was the total elimination of any nuclear program in Iran. That is the position"
    -            " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it"
    -            " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has"
    -            " always been to structure an agreement or series of agreements so that Iran could not covertly develop a"
    -            " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded"
    -            " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by"
    -            " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another"
    -            " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite"
    -            " sharp accusations by some in the United States and its allies, Iran denies having such a program, and"
    -            " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's"
    -            " continued cooperation with International Atomic Energy Agency inspections is further evidence on this"
    -            " point, and we'll know even more about Iran's program in the coming months and years because of the deal."
    -            " In fact, the inspections provisions that are part of this agreement are designed to protect against any"
    -            " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that"
    -            " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter"
    -            " warning that a deal might be killed by Congress or a future president). This of course is not the case."
    -            " The talks were between Iran and the five permanent members of the U.N. Security Council (United States,"
    -            " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has"
    -            " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement"
    -            " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran"
    -            " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement"
    -            " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the"
    -            " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased"
    -            " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes"
    -            " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear"
    -            " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going"
    -            " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such"
    -            " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the"
    -            ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not'
    -            " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New"
    -            " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement"
    -            " with Iran will not be so balanced.  The restrictions and obligations in the final framework agreement"
    -            " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove"
    -            " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally"
    -            " some insist that any agreement must address Iranian missile programs, human rights violations or support"
    -            " for Hamas or Hezbollah.  As important as these issues are, and they must indeed be addressed, they are"
    -            " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran.  To include them in"
    -            " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it"
    -            " affects the security of our negotiating partners and allies, including Israel. Those judgments should be"
    -            " fact-based, not based on questionable assertions or dubious assumptions."
    -        )
    -        ARTICLE_SUBWAY = (
    -            "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
    -            " year later, she got married again in Westchester County, but to a different man and without divorcing"
    -            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
    -            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
    -            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
    -            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
    -            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
    -            " license application, according to court documents. Prosecutors said the marriages were part of an"
    -            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
    -            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
    -            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
    -            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
    -            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
    -            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
    -            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
    -            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
    -            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
    -            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
    -            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
    -            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
    -            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
    -            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
    -            " up to four years in prison.  Her next court appearance is scheduled for May 18."
    -        )
    -
    -        expected_summaries = [
    -            'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a'
    -            " cell phone video of the final seconds . \"one can hear cries of 'My God' in several languages,\" one"
    -            " magazine says .",
    -            "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a"
    -            " preliminary examination into the situation in the occupied Palestinian territory . as members of the"
    -            " court, Palestinians may be subject to counter-charges as well .",
    -            "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller:"
    -            " the debate that has already begun since the announcement of the new framework will likely result in more"
    -            " heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and"
    -            " implement a rigorous inspection regime .",
    -            "prosecutors say the marriages were part of an immigration scam . if convicted, barrientos faces two"
    -            ' criminal counts of "offering a false instrument for filing in the first degree" she has been married 10'
    -            " times, with nine of her marriages occurring between 1999 and 2002 .",
    -        ]
    -
    -        use_task_specific_params(model, "summarization")
    -
    -        dct = tok(
    -            [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]],
    -            padding="max_length",
    -            truncation=True,
    -            return_tensors="pt",
    -        ).to(torch_device)
    -        self.assertEqual(512, dct["input_ids"].shape[1])
    -
    -        hypotheses_batch = model.generate(
    -            **dct,
    -            num_beams=4,
    -            length_penalty=2.0,
    -            max_length=142,
    -            min_length=56,
    -            no_repeat_ngram_size=3,
    -            do_sample=False,
    -            early_stopping=True,
    -        )
    -
    -        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    -        self.assertListEqual(
    -            expected_summaries,
    -            decoded,
    -        )
    -
    -    @slow
    -    def test_translation_en_to_de(self):
    -        model = self.model
    -        tok = self.tokenizer
    -        use_task_specific_params(model, "translation_en_to_de")
    -
    -        en_text = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.'
    -        expected_translation = (
    -            '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.'
    -        )
    -
    -        input_ids = tok.encode(model.config.prefix + en_text, return_tensors="pt")
    -        input_ids = input_ids.to(torch_device)
    -        output = model.generate(input_ids)
    -        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    -        self.assertEqual(translation, expected_translation)
    -
    -    @slow
    -    def test_translation_en_to_fr(self):
    -        model = self.model  # google-t5/t5-base
    -        tok = self.tokenizer
    -        use_task_specific_params(model, "translation_en_to_fr")
    -
    -        en_text = (
    -            ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of'
    -            " countless generations of stars: the oldest stars are seen as blue dots. "
    -        )
    -
    -        input_ids = tok.encode(model.config.prefix + en_text, return_tensors="pt")
    -        input_ids = input_ids.to(torch_device)
    -
    -        output = model.generate(
    -            input_ids=input_ids,
    -            num_beams=4,
    -            length_penalty=2.0,
    -            max_length=100,
    -            no_repeat_ngram_size=3,
    -            do_sample=False,
    -            early_stopping=True,
    -        )
    -        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    -        new_truncated_translation = (
    -            "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre "
    -            "un "
    -            "« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées "
    -            "sous forme "
    -            "de points bleus."
    -        )
    -
    -        self.assertEqual(translation, new_truncated_translation)
    -
    -    @slow
    -    def test_translation_en_to_ro(self):
    -        model = self.model
    -        tok = self.tokenizer
    -        use_task_specific_params(model, "translation_en_to_ro")
    -        en_text = "Taco Bell said it plans to add 2,000 locations in the US by 2022."
    -        expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022."
    -
    -        inputs = tok(model.config.prefix + en_text, return_tensors="pt").to(torch_device)
    -        output = model.generate(**inputs)
    -        translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    -        self.assertEqual(translation, expected_translation)
    -
    -    @slow
    -    def test_contrastive_search_t5(self):
    -        article = (
    -            " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
    -            " year later, she got married again in Westchester County, but to a different man and without divorcing"
    -            " her first husband.  Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
    -            ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
    -            " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
    -            ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
    -            ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
    -            " license application, according to court documents. Prosecutors said the marriages were part of an"
    -            " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
    -            " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
    -            " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
    -            " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
    -            " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.  All"
    -            " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
    -            " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
    -            " said the immigration scam involved some of her husbands, who filed for permanent residence status"
    -            " shortly after the marriages.  Any divorces happened only after such filings were approved. It was"
    -            " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
    -            " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
    -            ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
    -            " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
    -            " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces"
    -            " up to four years in prison.  Her next court appearance is scheduled for May 18."
    -        )
    -        article = "summarize: " + article.strip()
    -        t5_tokenizer = AutoTokenizer.from_pretrained("flax-community/t5-base-cnn-dm")
    -        t5_model = T5ForConditionalGeneration.from_pretrained("flax-community/t5-base-cnn-dm").to(torch_device)
    -        input_ids = t5_tokenizer(
    -            article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt"
    -        ).input_ids.to(torch_device)
    -
    -        outputs = t5_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64)
    -        generated_text = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    -
    -        self.assertListEqual(
    -            generated_text,
    -            [
    -                "Liana Barrientos has been married 10 times, nine of them in the Bronx. Her husbands filed for "
    -                "permanent residence after the marriages, prosecutors say."
    -            ],
    -        )
    -
    -
    -@require_torch
    -class TestAsymmetricT5(unittest.TestCase):
    -    def build_model_and_check_forward_pass(self, **kwargs):
    -        tester = T5ModelTester(self, **kwargs)
    -        config, *inputs = tester.prepare_config_and_inputs()
    -        (
    -            input_ids,
    -            decoder_input_ids,
    -            attention_mask,
    -            decoder_attention_mask,
    -            lm_labels,
    -        ) = inputs
    -        model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
    -        outputs = model(
    -            input_ids=input_ids,
    -            decoder_input_ids=decoder_input_ids,
    -            decoder_attention_mask=decoder_attention_mask,
    -            labels=lm_labels,
    -        )
    -        # outputs = model(*inputs)
    -        assert len(outputs) == 4
    -        assert outputs["logits"].size() == (tester.batch_size, tester.decoder_seq_length, tester.vocab_size)
    -        assert outputs["loss"].size() == ()
    -        return model
    -
    -    def test_small_decoder(self):
    -        # num_hidden_layers is passed to T5Config as num_layers
    -        model = self.build_model_and_check_forward_pass(decoder_layers=1, num_hidden_layers=2)
    -        assert len(model.encoder.block) == 2
    -        assert len(model.decoder.block) == 1
    -
    -    def test_defaulting_to_symmetry(self):
    -        # num_hidden_layers is passed to T5Config as num_layers
    -        model = self.build_model_and_check_forward_pass(num_hidden_layers=2)
    -        assert len(model.decoder.block) == len(model.encoder.block) == 2
    diff --git a/tests/transformers/tests/models/vit/__init__.py b/tests/transformers/tests/models/vit/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/vit/test_modeling_vit.py b/tests/transformers/tests/models/vit/test_modeling_vit.py
    deleted file mode 100644
    index 3b86395362..0000000000
    --- a/tests/transformers/tests/models/vit/test_modeling_vit.py
    +++ /dev/null
    @@ -1,328 +0,0 @@
    -
    -# coding=utf-8
    -# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -""" Testing suite for the PyTorch ViT model. """
    -
    -
    -import unittest
    -
    -from transformers import ViTConfig
    -from transformers.testing_utils import (
    -    require_accelerate,
    -    require_torch,
    -    require_torch_accelerator,
    -    require_torch_fp16,
    -    require_vision,
    -    slow,
    -    torch_device,
    -)
    -from transformers.utils import cached_property, is_torch_available, is_vision_available
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
    -
    -
    -if is_torch_available():
    -    import torch
    -    from torch import nn
    -
    -    from transformers import ViTForImageClassification, ViTForMaskedImageModeling, ViTModel
    -    from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST
    -
    -
    -if is_vision_available():
    -    from PIL import Image
    -
    -    from transformers import ViTImageProcessor
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -
    -class ViTModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        image_size=30,
    -        patch_size=2,
    -        num_channels=3,
    -        is_training=True,
    -        use_labels=True,
    -        hidden_size=32,
    -        num_hidden_layers=2,
    -        num_attention_heads=4,
    -        intermediate_size=37,
    -        hidden_act="gelu",
    -        hidden_dropout_prob=0.1,
    -        attention_probs_dropout_prob=0.1,
    -        type_sequence_label_size=10,
    -        initializer_range=0.02,
    -        scope=None,
    -        encoder_stride=2,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.image_size = image_size
    -        self.patch_size = patch_size
    -        self.num_channels = num_channels
    -        self.is_training = is_training
    -        self.use_labels = use_labels
    -        self.hidden_size = hidden_size
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.intermediate_size = intermediate_size
    -        self.hidden_act = hidden_act
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.attention_probs_dropout_prob = attention_probs_dropout_prob
    -        self.type_sequence_label_size = type_sequence_label_size
    -        self.initializer_range = initializer_range
    -        self.scope = scope
    -        self.encoder_stride = encoder_stride
    -
    -        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
    -        num_patches = (image_size // patch_size) ** 2
    -        self.seq_length = num_patches + 1
    -
    -    def prepare_config_and_inputs(self):
    -        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
    -
    -        labels = None
    -        if self.use_labels:
    -            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
    -
    -        config = self.get_config()
    -
    -        return config, pixel_values, labels
    -
    -    def get_config(self):
    -        return ViTConfig(
    -            image_size=self.image_size,
    -            patch_size=self.patch_size,
    -            num_channels=self.num_channels,
    -            hidden_size=self.hidden_size,
    -            num_hidden_layers=self.num_hidden_layers,
    -            num_attention_heads=self.num_attention_heads,
    -            intermediate_size=self.intermediate_size,
    -            hidden_act=self.hidden_act,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
    -            is_decoder=False,
    -            initializer_range=self.initializer_range,
    -            encoder_stride=self.encoder_stride,
    -        )
    -
    -    def create_and_check_model(self, config, pixel_values, labels):
    -        model = ViTModel(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    -
    -    def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels):
    -        model = ViTForMaskedImageModeling(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values)
    -        self.parent.assertEqual(
    -            result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size)
    -        )
    -
    -        # test greyscale images
    -        config.num_channels = 1
    -        model = ViTForMaskedImageModeling(config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
    -        result = model(pixel_values)
    -        self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size))
    -
    -    def create_and_check_for_image_classification(self, config, pixel_values, labels):
    -        config.num_labels = self.type_sequence_label_size
    -        model = ViTForImageClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(pixel_values, labels=labels)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
    -
    -        # test greyscale images
    -        config.num_channels = 1
    -        model = ViTForImageClassification(config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
    -        result = model(pixel_values)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config_and_inputs = self.prepare_config_and_inputs()
    -        (
    -            config,
    -            pixel_values,
    -            labels,
    -        ) = config_and_inputs
    -        inputs_dict = {"pixel_values": pixel_values}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class ViTModelTest(ModelTesterMixin, unittest.TestCase):
    -    """
    -    Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds,
    -    attention_mask and seq_length.
    -    """
    -
    -    all_model_classes = (
    -        (
    -            ViTModel,
    -            ViTForImageClassification,
    -            ViTForMaskedImageModeling,
    -        )
    -        if is_torch_available()
    -        else ()
    -    )
    -    pipeline_model_mapping = (
    -        {"image-feature-extraction": ViTModel, "image-classification": ViTForImageClassification}
    -        if is_torch_available()
    -        else {}
    -    )
    -    fx_compatible = True
    -
    -    test_pruning = False
    -    test_resize_embeddings = False
    -    test_head_masking = False
    -
    -    def setUp(self):
    -        self.model_tester = ViTModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    @unittest.skip(reason="ViT does not use inputs_embeds")
    -    def test_inputs_embeds(self):
    -        pass
    -
    -    def test_model_common_attributes(self):
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
    -            x = model.get_output_embeddings()
    -            self.assertTrue(x is None or isinstance(x, nn.Linear))
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_for_masked_image_modeling(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)
    -
    -    def test_for_image_classification(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model_name = "google/vit-base-patch16-224"
    -        model = ViTModel.from_pretrained(model_name)
    -        self.assertIsNotNone(model)
    -
    -
    -# We will verify our results on an image of cute cats
    -def prepare_img():
    -    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    -    return image
    -
    -
    -@require_torch
    -@require_vision
    -class ViTModelIntegrationTest(unittest.TestCase):
    -    @cached_property
    -    def default_image_processor(self):
    -        return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None
    -
    -    @slow
    -    def test_inference_image_classification_head(self):
    -        model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device)
    -
    -        image_processor = self.default_image_processor
    -        image = prepare_img()
    -        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
    -
    -        # forward pass
    -        with torch.no_grad():
    -            outputs = model(**inputs)
    -
    -        # verify the logits
    -        expected_shape = torch.Size((1, 1000))
    -        self.assertEqual(outputs.logits.shape, expected_shape)
    -
    -        expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device)
    -
    -        self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
    -
    -    @slow
    -    def test_inference_interpolate_pos_encoding(self):
    -        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
    -        # allowing to interpolate the pre-trained position embeddings in order to use
    -        # the model on higher resolutions. The DINO model by Facebook AI leverages this
    -        # to visualize self-attention on higher resolution images.
    -        model = ViTModel.from_pretrained("facebook/dino-vits8").to(torch_device)
    -
    -        image_processor = ViTImageProcessor.from_pretrained("facebook/dino-vits8", size=480)
    -        image = prepare_img()
    -        inputs = image_processor(images=image, return_tensors="pt")
    -        pixel_values = inputs.pixel_values.to(torch_device)
    -
    -        # forward pass
    -        with torch.no_grad():
    -            outputs = model(pixel_values, interpolate_pos_encoding=True)
    -
    -        # verify the last hidden state
    -        expected_shape = torch.Size((1, 3601, 384))
    -        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
    -
    -        expected_slice = torch.tensor(
    -            [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]]
    -        ).to(torch_device)
    -
    -        self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
    -
    -    @slow
    -    @require_accelerate
    -    @require_torch_accelerator
    -    @require_torch_fp16
    -    def test_inference_fp16(self):
    -        r"""
    -        A small test to make sure that inference works in half precision without any problem.
    -        """
    -        model = ViTModel.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto")
    -        image_processor = self.default_image_processor
    -
    -        image = prepare_img()
    -        inputs = image_processor(images=image, return_tensors="pt")
    -        pixel_values = inputs.pixel_values.to(torch_device)
    -
    -        # forward pass to make sure inference works in fp16
    -        with torch.no_grad():
    -            _ = model(pixel_values)
    -
    diff --git a/tests/transformers/tests/models/wav2vec2/__init__.py b/tests/transformers/tests/models/wav2vec2/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py
    deleted file mode 100644
    index 00a44f2129..0000000000
    --- a/tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py
    +++ /dev/null
    @@ -1,2000 +0,0 @@
    -# coding=utf-8
    -# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -""" Testing suite for the PyTorch Wav2Vec2 model. """
    -
    -import gc
    -import math
    -import multiprocessing
    -import os
    -import pickle
    -import tempfile
    -import traceback
    -import unittest
    -
    -import numpy as np
    -from datasets import load_dataset
    -
    -from transformers import Wav2Vec2Config, is_torch_available
    -from transformers.testing_utils import (
    -    CaptureLogger,
    -    backend_empty_cache,
    -    is_pt_flax_cross_test,
    -    is_pyctcdecode_available,
    -    is_torchaudio_available,
    -    require_pyctcdecode,
    -    require_soundfile,
    -    require_torch,
    -    require_torchaudio,
    -    run_test_in_subprocess,
    -    slow,
    -    torch_device,
    -)
    -from transformers.utils import is_torch_fx_available
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -from ...test_configuration_common import ConfigTester
    -from ...test_modeling_common import (
    -    ModelTesterMixin,
    -    _config_zero_init,
    -    floats_tensor,
    -    ids_tensor,
    -    random_attention_mask,
    -)
    -
    -
    -if is_torch_available():
    -    import torch
    -    from safetensors.torch import save_file as safe_save_file
    -
    -    from transformers import (
    -        Wav2Vec2FeatureExtractor,
    -        Wav2Vec2ForAudioFrameClassification,
    -        Wav2Vec2ForCTC,
    -        Wav2Vec2ForMaskedLM,
    -        Wav2Vec2ForPreTraining,
    -        Wav2Vec2ForSequenceClassification,
    -        Wav2Vec2ForXVector,
    -        Wav2Vec2Model,
    -        Wav2Vec2Processor,
    -    )
    -    from transformers.models.wav2vec2.modeling_wav2vec2 import (
    -        WAV2VEC2_ADAPTER_PT_FILE,
    -        WAV2VEC2_ADAPTER_SAFE_FILE,
    -        Wav2Vec2GumbelVectorQuantizer,
    -        _compute_mask_indices,
    -        _sample_negative_indices,
    -    )
    -
    -
    -if is_torchaudio_available():
    -    import torchaudio
    -
    -
    -if is_pyctcdecode_available():
    -    import pyctcdecode.decoder
    -
    -    from transformers import Wav2Vec2ProcessorWithLM
    -    from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm
    -
    -
    -if is_torch_fx_available():
    -    from transformers.utils.fx import symbolic_trace
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
    -    error = None
    -    try:
    -        _ = in_queue.get(timeout=timeout)
    -
    -        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
    -        sample = next(iter(ds))
    -
    -        resampled_audio = torchaudio.functional.resample(
    -            torch.tensor(sample["audio"]["array"]), 48_000, 16_000
    -        ).numpy()
    -
    -        model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm").to(
    -            torch_device
    -        )
    -        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
    -
    -        input_values = processor(resampled_audio, return_tensors="pt").input_values
    -
    -        with torch.no_grad():
    -            logits = model(input_values.to(torch_device)).logits
    -
    -        # use a spawn pool, which should trigger a warning if different than fork
    -        with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool:
    -            transcription = processor.batch_decode(logits.cpu().numpy(), pool).text
    -
    -        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
    -        unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas")
    -
    -        # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork
    -        multiprocessing.set_start_method("spawn", force=True)
    -        with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl:
    -            transcription = processor.batch_decode(logits.cpu().numpy()).text
    -
    -        unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out)
    -        unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas")
    -    except Exception:
    -        error = f"{traceback.format_exc()}"
    -
    -    results = {"error": error}
    -    out_queue.put(results, timeout=timeout)
    -    out_queue.join()
    -
    -
    -class Wav2Vec2ModelTester:
    -    def __init__(
    -        self,
    -        parent,
    -        batch_size=13,
    -        seq_length=1024,  # speech is longer
    -        is_training=False,
    -        hidden_size=16,
    -        feat_extract_norm="group",
    -        feat_extract_dropout=0.0,
    -        feat_extract_activation="gelu",
    -        conv_dim=(32, 32, 32),
    -        conv_stride=(4, 4, 4),
    -        conv_kernel=(8, 8, 8),
    -        conv_bias=False,
    -        num_conv_pos_embeddings=16,
    -        num_conv_pos_embedding_groups=2,
    -        num_hidden_layers=2,
    -        num_attention_heads=2,
    -        hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
    -        intermediate_size=20,
    -        layer_norm_eps=1e-5,
    -        hidden_act="gelu",
    -        initializer_range=0.02,
    -        mask_time_prob=0.5,
    -        mask_time_length=2,
    -        vocab_size=32,
    -        do_stable_layer_norm=False,
    -        num_adapter_layers=1,
    -        adapter_stride=2,
    -        tdnn_dim=(32, 32),
    -        tdnn_kernel=(5, 3),
    -        tdnn_dilation=(1, 2),
    -        xvector_output_dim=32,
    -        scope=None,
    -    ):
    -        self.parent = parent
    -        self.batch_size = batch_size
    -        self.seq_length = seq_length
    -        self.is_training = is_training
    -        self.hidden_size = hidden_size
    -        self.feat_extract_norm = feat_extract_norm
    -        self.feat_extract_dropout = feat_extract_dropout
    -        self.feat_extract_activation = feat_extract_activation
    -        self.conv_dim = conv_dim
    -        self.conv_stride = conv_stride
    -        self.conv_kernel = conv_kernel
    -        self.conv_bias = conv_bias
    -        self.num_conv_pos_embeddings = num_conv_pos_embeddings
    -        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
    -        self.num_hidden_layers = num_hidden_layers
    -        self.num_attention_heads = num_attention_heads
    -        self.hidden_dropout_prob = hidden_dropout_prob
    -        self.intermediate_size = intermediate_size
    -        self.layer_norm_eps = layer_norm_eps
    -        self.hidden_act = hidden_act
    -        self.initializer_range = initializer_range
    -        self.vocab_size = vocab_size
    -        self.do_stable_layer_norm = do_stable_layer_norm
    -        self.num_adapter_layers = num_adapter_layers
    -        self.adapter_stride = adapter_stride
    -        self.mask_time_prob = mask_time_prob
    -        self.mask_time_length = mask_time_length
    -        self.scope = scope
    -        self.tdnn_dim = tdnn_dim
    -        self.tdnn_kernel = tdnn_kernel
    -        self.tdnn_dilation = tdnn_dilation
    -        self.xvector_output_dim = xvector_output_dim
    -
    -        output_seq_length = self.seq_length
    -        for kernel, stride in zip(self.conv_kernel, self.conv_stride):
    -            output_seq_length = (output_seq_length - (kernel - 1)) / stride
    -        self.output_seq_length = int(math.ceil(output_seq_length))
    -        self.encoder_seq_length = self.output_seq_length
    -
    -        self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1
    -
    -    def prepare_config_and_inputs(self):
    -        input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0)
    -        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
    -
    -        config = self.get_config()
    -
    -        return config, input_values, attention_mask
    -
    -    def get_config(self):
    -        return Wav2Vec2Config(
    -            hidden_size=self.hidden_size,
    -            feat_extract_norm=self.feat_extract_norm,
    -            feat_extract_dropout=self.feat_extract_dropout,
    -            feat_extract_activation=self.feat_extract_activation,
    -            conv_dim=self.conv_dim,
    -            conv_stride=self.conv_stride,
    -            conv_kernel=self.conv_kernel,
    -            conv_bias=self.conv_bias,
    -            mask_time_prob=self.mask_time_prob,
    -            mask_time_length=self.mask_time_length,
    -            num_conv_pos_embeddings=self.num_conv_pos_embeddings,
    -            num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
    -            num_hidden_layers=self.num_hidden_layers,
    -            num_attention_heads=self.num_attention_heads,
    -            hidden_dropout_prob=self.hidden_dropout_prob,
    -            intermediate_size=self.intermediate_size,
    -            layer_norm_eps=self.layer_norm_eps,
    -            do_stable_layer_norm=self.do_stable_layer_norm,
    -            hidden_act=self.hidden_act,
    -            initializer_range=self.initializer_range,
    -            vocab_size=self.vocab_size,
    -            num_adapter_layers=self.num_adapter_layers,
    -            adapter_stride=self.adapter_stride,
    -            tdnn_dim=self.tdnn_dim,
    -            tdnn_kernel=self.tdnn_kernel,
    -            tdnn_dilation=self.tdnn_dilation,
    -            xvector_output_dim=self.xvector_output_dim,
    -        )
    -
    -    def create_and_check_model(self, config, input_values, attention_mask):
    -        model = Wav2Vec2Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_values, attention_mask=attention_mask)
    -        self.parent.assertEqual(
    -            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
    -        )
    -
    -    def create_and_check_model_with_adapter(self, config, input_values, attention_mask):
    -        config.add_adapter = True
    -        model = Wav2Vec2Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_values, attention_mask=attention_mask)
    -        self.parent.assertEqual(
    -            result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size)
    -        )
    -
    -    def create_and_check_model_with_adapter_for_ctc(self, config, input_values, attention_mask):
    -        config.add_adapter = True
    -        config.output_hidden_size = 2 * config.hidden_size
    -        model = Wav2Vec2ForCTC(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_values, attention_mask=attention_mask)
    -        self.parent.assertEqual(
    -            result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size)
    -        )
    -
    -    def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask):
    -        config.add_adapter = True
    -        config.output_hidden_size = 8
    -        model = Wav2Vec2Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_values, attention_mask=attention_mask)
    -        self.parent.assertEqual(
    -            result.last_hidden_state.shape,
    -            (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size),
    -        )
    -
    -    def create_and_check_model_with_attn_adapter(self, config, input_values, attention_mask):
    -        config.adapter_attn_dim = 16
    -        model = Wav2Vec2ForCTC(config=config)
    -
    -        self.parent.assertIsNotNone(model._get_adapters())
    -
    -        model.to(torch_device)
    -        model.eval()
    -        result = model(input_values, attention_mask=attention_mask)
    -        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size))
    -
    -    def create_and_check_batch_inference(self, config, input_values, *args):
    -        # test does not pass for models making use of `group_norm`
    -        # check: https://github.com/pytorch/fairseq/issues/3227
    -        model = Wav2Vec2Model(config=config)
    -        model.to(torch_device)
    -        model.eval()
    -
    -        input_values = input_values[:3]
    -        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool)
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -            attention_mask[i, input_lengths[i] :] = 0.0
    -
    -        batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state
    -
    -        for i in range(input_values.shape[0]):
    -            input_slice = input_values[i : i + 1, : input_lengths[i]]
    -            output = model(input_slice).last_hidden_state
    -
    -            batch_output = batch_outputs[i : i + 1, : output.shape[1]]
    -            self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3))
    -
    -    def check_ctc_loss(self, config, input_values, *args):
    -        model = Wav2Vec2ForCTC(config=config)
    -        model.to(torch_device)
    -
    -        # make sure that dropout is disabled
    -        model.eval()
    -
    -        input_values = input_values[:3]
    -        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    -        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -            attention_mask[i, input_lengths[i] :] = 0
    -
    -        model.config.ctc_loss_reduction = "sum"
    -        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
    -
    -        model.config.ctc_loss_reduction = "mean"
    -        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
    -
    -        self.parent.assertTrue(isinstance(sum_loss, float))
    -        self.parent.assertTrue(isinstance(mean_loss, float))
    -
    -    def check_seq_classifier_loss(self, config, input_values, *args):
    -        model = Wav2Vec2ForSequenceClassification(config=config)
    -        model.to(torch_device)
    -
    -        # make sure that dropout is disabled
    -        model.eval()
    -
    -        input_values = input_values[:3]
    -        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -            attention_mask[i, input_lengths[i] :] = 0
    -
    -        masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()
    -        unmasked_loss = model(input_values, labels=labels).loss.item()
    -
    -        self.parent.assertTrue(isinstance(masked_loss, float))
    -        self.parent.assertTrue(isinstance(unmasked_loss, float))
    -        self.parent.assertTrue(masked_loss != unmasked_loss)
    -
    -    def check_ctc_training(self, config, input_values, *args):
    -        config.ctc_zero_infinity = True
    -        model = Wav2Vec2ForCTC(config=config)
    -        model.to(torch_device)
    -        model.train()
    -
    -        # freeze feature encoder
    -        model.freeze_feature_encoder()
    -
    -        input_values = input_values[:3]
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    -        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -
    -            if max_length_labels[i] < labels.shape[-1]:
    -                # it's important that we make sure that target lengths are at least
    -                # one shorter than logit lengths to prevent -inf
    -                labels[i, max_length_labels[i] - 1 :] = -100
    -
    -        loss = model(input_values, labels=labels).loss
    -        self.parent.assertFalse(torch.isinf(loss).item())
    -
    -        loss.backward()
    -
    -    def check_seq_classifier_training(self, config, input_values, *args):
    -        config.ctc_zero_infinity = True
    -        model = Wav2Vec2ForSequenceClassification(config=config)
    -        model.to(torch_device)
    -        model.train()
    -
    -        # freeze everything but the classification head
    -        model.freeze_base_model()
    -
    -        input_values = input_values[:3]
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -
    -        loss = model(input_values, labels=labels).loss
    -        self.parent.assertFalse(torch.isinf(loss).item())
    -
    -        loss.backward()
    -
    -    def check_xvector_training(self, config, input_values, *args):
    -        config.ctc_zero_infinity = True
    -        model = Wav2Vec2ForXVector(config=config)
    -        model.to(torch_device)
    -        model.train()
    -
    -        # freeze everything but the classification head
    -        model.freeze_base_model()
    -
    -        input_values = input_values[:3]
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))
    -
    -        # pad input
    -        for i in range(len(input_lengths)):
    -            input_values[i, input_lengths[i] :] = 0.0
    -
    -        loss = model(input_values, labels=labels).loss
    -        self.parent.assertFalse(torch.isinf(loss).item())
    -
    -        loss.backward()
    -
    -    def check_labels_out_of_vocab(self, config, input_values, *args):
    -        model = Wav2Vec2ForCTC(config)
    -        model.to(torch_device)
    -        model.train()
    -
    -        input_values = input_values[:3]
    -
    -        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    -        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    -        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)
    -
    -        with self.parent.assertRaises(ValueError):
    -            model(input_values, labels=labels)
    -
    -    def prepare_config_and_inputs_for_common(self):
    -        config, input_values, attention_mask = self.prepare_config_and_inputs()
    -        inputs_dict = {"input_values": input_values, "attention_mask": attention_mask}
    -        return config, inputs_dict
    -
    -
    -@require_torch
    -class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining)
    -        if is_torch_available()
    -        else ()
    -    )
    -    pipeline_model_mapping = (
    -        {
    -            "audio-classification": Wav2Vec2ForSequenceClassification,
    -            "automatic-speech-recognition": Wav2Vec2ForCTC,
    -            "feature-extraction": Wav2Vec2Model,
    -            "fill-mask": Wav2Vec2ForMaskedLM,
    -        }
    -        if is_torch_available()
    -        else {}
    -    )
    -    fx_compatible = True
    -    test_pruning = False
    -    test_headmasking = False
    -
    -    def setUp(self):
    -        self.model_tester = Wav2Vec2ModelTester(self)
    -        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_with_adapter(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)
    -
    -    def test_model_with_adapter_for_ctc(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_adapter_for_ctc(*config_and_inputs)
    -
    -    def test_model_with_adapter_proj_dim(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs)
    -
    -    def test_ctc_loss_inference(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_ctc_loss(*config_and_inputs)
    -
    -    def test_seq_classifier_loss_inference(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_seq_classifier_loss(*config_and_inputs)
    -
    -    def test_ctc_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_ctc_training(*config_and_inputs)
    -
    -    def test_seq_classifier_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_seq_classifier_training(*config_and_inputs)
    -
    -    def test_xvector_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_xvector_training(*config_and_inputs)
    -
    -    def test_labels_out_of_vocab(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
    -
    -    # Wav2Vec2 has no inputs_embeds
    -    def test_inputs_embeds(self):
    -        pass
    -
    -    # `input_ids` is renamed to `input_values`
    -    def test_forward_signature(self):
    -        pass
    -
    -    # Wav2Vec2 cannot resize token embeddings
    -    # since it has no token embeddings
    -    def test_resize_tokens_embeddings(self):
    -        pass
    -
    -    # Wav2Vec2 has no inputs_embeds
    -    # and thus the `get_input_embeddings` fn
    -    # is not implemented
    -    def test_model_common_attributes(self):
    -        pass
    -
    -    @is_pt_flax_cross_test
    -    # non-robust architecture does not exist in Flax
    -    def test_equivalence_flax_to_pt(self):
    -        pass
    -
    -    @is_pt_flax_cross_test
    -    # non-robust architecture does not exist in Flax
    -    def test_equivalence_pt_to_flax(self):
    -        pass
    -
    -    def test_retain_grad_hidden_states_attentions(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_hidden_states = True
    -        config.output_attentions = True
    -
    -        # no need to test all models as different heads yield the same functionality
    -        model_class = self.all_model_classes[0]
    -        model = model_class(config)
    -        model.to(torch_device)
    -
    -        # set layer drop to 0
    -        model.config.layerdrop = 0.0
    -
    -        input_values = inputs_dict["input_values"]
    -
    -        input_lengths = torch.tensor(
    -            [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device
    -        )
    -        output_lengths = model._get_feat_extract_output_lengths(input_lengths)
    -
    -        labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size)
    -        inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
    -        inputs_dict["labels"] = labels
    -
    -        outputs = model(**inputs_dict)
    -
    -        output = outputs[0]
    -
    -        # Encoder-/Decoder-only models
    -        hidden_states = outputs.hidden_states[0]
    -        attentions = outputs.attentions[0]
    -
    -        hidden_states.retain_grad()
    -        attentions.retain_grad()
    -
    -        output.flatten()[0].backward(retain_graph=True)
    -
    -        self.assertIsNotNone(hidden_states.grad)
    -        self.assertIsNotNone(attentions.grad)
    -
    -    def test_initialization(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        configs_no_init = _config_zero_init(config)
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            for name, param in model.named_parameters():
    -                uniform_init_parms = [
    -                    "conv.weight",
    -                    "conv.parametrizations.weight",
    -                    "masked_spec_embed",
    -                    "codevectors",
    -                    "quantizer.weight_proj.weight",
    -                    "project_hid.weight",
    -                    "project_hid.bias",
    -                    "project_q.weight",
    -                    "project_q.bias",
    -                    "feature_projection.projection.weight",
    -                    "feature_projection.projection.bias",
    -                    "objective.weight",
    -                ]
    -                if param.requires_grad:
    -                    if any(x in name for x in uniform_init_parms):
    -                        self.assertTrue(
    -                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
    -                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                        )
    -                    else:
    -                        self.assertIn(
    -                            ((param.data.mean() * 1e9).round() / 1e9).item(),
    -                            [0.0, 1.0],
    -                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                        )
    -
    -    # overwrite from test_modeling_common
    -    def _mock_init_weights(self, module):
    -        if hasattr(module, "weight") and module.weight is not None:
    -            module.weight.data.fill_(3)
    -        if hasattr(module, "weight_g") and module.weight_g is not None:
    -            module.weight_g.data.fill_(3)
    -        if hasattr(module, "weight_v") and module.weight_v is not None:
    -            module.weight_v.data.fill_(3)
    -        if hasattr(module, "bias") and module.bias is not None:
    -            module.bias.data.fill_(3)
    -        if hasattr(module, "codevectors") and module.codevectors is not None:
    -            module.codevectors.data.fill_(3)
    -        if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
    -            module.masked_spec_embed.data.fill_(3)
    -
    -    def test_mask_feature_prob_ctc(self):
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2
    -        )
    -        model.to(torch_device).train()
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        batch_duration_in_seconds = [1, 3, 2, 6]
    -        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
    -
    -        batch = processor(
    -            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
    -        )
    -
    -        logits = model(
    -            input_values=batch["input_values"].to(torch_device),
    -            attention_mask=batch["attention_mask"].to(torch_device),
    -        ).logits
    -
    -        self.assertEqual(logits.shape, (4, 1498, 32))
    -
    -    def test_mask_time_prob_ctc(self):
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2
    -        )
    -        model.to(torch_device).train()
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        batch_duration_in_seconds = [1, 3, 2, 6]
    -        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
    -
    -        batch = processor(
    -            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
    -        )
    -
    -        logits = model(
    -            input_values=batch["input_values"].to(torch_device),
    -            attention_mask=batch["attention_mask"].to(torch_device),
    -        ).logits
    -
    -        self.assertEqual(logits.shape, (4, 1498, 32))
    -
    -    @unittest.skip(reason="Feed forward chunking is not implemented")
    -    def test_feed_forward_chunking(self):
    -        pass
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
    -        self.assertIsNotNone(model)
    -
    -    # Wav2Vec2 cannot be torchscripted because of group norm.
    -    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
    -        # TODO: fix it
    -        self.skipTest("torch 2.1 breaks torch fx tests for wav2vec2/hubert.")
    -
    -        if not is_torch_fx_available() or not self.fx_compatible:
    -            return
    -
    -        configs_no_init = _config_zero_init(config)  # To be sure we have no NaNs
    -        configs_no_init.return_dict = False
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            model.to(torch_device)
    -            model.eval()
    -            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
    -
    -            try:
    -                input_names = [
    -                    "attention_mask",
    -                    "bbox",
    -                    "input_features",
    -                    "input_ids",
    -                    "input_values",
    -                    "pixel_values",
    -                    "token_type_ids",
    -                    "visual_feats",
    -                    "visual_pos",
    -                ]
    -
    -                labels = inputs.get("labels", None)
    -                start_positions = inputs.get("start_positions", None)
    -                end_positions = inputs.get("end_positions", None)
    -                if labels is not None:
    -                    input_names.append("labels")
    -                if start_positions is not None:
    -                    input_names.append("start_positions")
    -                if end_positions is not None:
    -                    input_names.append("end_positions")
    -
    -                filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
    -                input_names = list(filtered_inputs.keys())
    -
    -                model_output = model(**filtered_inputs)
    -
    -                if (
    -                    isinstance(model, Wav2Vec2ForSequenceClassification)
    -                    and not hasattr(model.config, "problem_type")
    -                    or model.config.problem_type is None
    -                ):
    -                    model.config.problem_type = "single_label_classification"
    -
    -                traced_model = symbolic_trace(model, input_names)
    -                traced_output = traced_model(**filtered_inputs)
    -
    -            except Exception as e:
    -                self.fail(f"Couldn't trace module: {e}")
    -
    -            def flatten_output(output):
    -                flatten = []
    -                for x in output:
    -                    if isinstance(x, (tuple, list)):
    -                        flatten += flatten_output(x)
    -                    elif not isinstance(x, torch.Tensor):
    -                        continue
    -                    else:
    -                        flatten.append(x)
    -                return flatten
    -
    -            model_output = flatten_output(model_output)
    -            traced_output = flatten_output(traced_output)
    -            num_outputs = len(model_output)
    -
    -            for i in range(num_outputs):
    -                self.assertTrue(
    -                    torch.allclose(model_output[i], traced_output[i]),
    -                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
    -                )
    -
    -            # Test that the model can be serialized and restored properly
    -            with tempfile.TemporaryDirectory() as tmp_dir_name:
    -                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
    -                try:
    -                    with open(pkl_file_name, "wb") as f:
    -                        pickle.dump(traced_model, f)
    -                    with open(pkl_file_name, "rb") as f:
    -                        loaded = pickle.load(f)
    -                except Exception as e:
    -                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
    -
    -                loaded_output = loaded(**filtered_inputs)
    -                loaded_output = flatten_output(loaded_output)
    -
    -                for i in range(num_outputs):
    -                    self.assertTrue(
    -                        torch.allclose(model_output[i], loaded_output[i]),
    -                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
    -                    )
    -
    -            # Avoid a memory leak. Without this, each call increases RAM usage by ~20MB.
    -            # (Even with this call, there is still a leak of ~0.04MB per call.)
    -            self.clear_torch_jit_class_registry()
    -
    -    @unittest.skip(
    -        "Need to investigate why config.do_stable_layer_norm is set to False here when it doesn't seem to be supported"
    -    )
    -    def test_flax_from_pt_safetensors(self):
    -        return
    -
    -
    -@require_torch
    -class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
    -    all_model_classes = (
    -        (
    -            Wav2Vec2ForCTC,
    -            Wav2Vec2Model,
    -            Wav2Vec2ForMaskedLM,
    -            Wav2Vec2ForSequenceClassification,
    -            Wav2Vec2ForPreTraining,
    -            Wav2Vec2ForAudioFrameClassification,
    -            Wav2Vec2ForXVector,
    -        )
    -        if is_torch_available()
    -        else ()
    -    )
    -    test_pruning = False
    -    test_headmasking = False
    -
    -    def setUp(self):
    -        self.model_tester = Wav2Vec2ModelTester(
    -            self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True
    -        )
    -        self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37)
    -
    -    def test_config(self):
    -        self.config_tester.run_common_tests()
    -
    -    def test_model(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model(*config_and_inputs)
    -
    -    def test_model_with_adapter(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)
    -
    -    def test_model_with_adapter_proj_dim(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs)
    -
    -    def test_model_with_attn_adapter(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_model_with_attn_adapter(*config_and_inputs)
    -
    -    def test_batched_inference(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.create_and_check_batch_inference(*config_and_inputs)
    -
    -    def test_ctc_loss_inference(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_ctc_loss(*config_and_inputs)
    -
    -    def test_seq_classifier_loss_inference(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_seq_classifier_loss(*config_and_inputs)
    -
    -    def test_ctc_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_ctc_training(*config_and_inputs)
    -
    -    def test_seq_classifier_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_seq_classifier_training(*config_and_inputs)
    -
    -    def test_xvector_train(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_xvector_training(*config_and_inputs)
    -
    -    def test_labels_out_of_vocab(self):
    -        config_and_inputs = self.model_tester.prepare_config_and_inputs()
    -        self.model_tester.check_labels_out_of_vocab(*config_and_inputs)
    -
    -    # Wav2Vec2 has no inputs_embeds
    -    def test_inputs_embeds(self):
    -        pass
    -
    -    # `input_ids` is renamed to `input_values`
    -    def test_forward_signature(self):
    -        pass
    -
    -    # Wav2Vec2 cannot resize token embeddings
    -    # since it has no token embeddings
    -    def test_resize_tokens_embeddings(self):
    -        pass
    -
    -    # Wav2Vec2 has no inputs_embeds
    -    # and thus the `get_input_embeddings` fn
    -    # is not implemented
    -    def test_model_common_attributes(self):
    -        pass
    -
    -    def test_retain_grad_hidden_states_attentions(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_hidden_states = True
    -        config.output_attentions = True
    -
    -        # no need to test all models as different heads yield the same functionality
    -        model_class = self.all_model_classes[0]
    -        model = model_class(config)
    -        model.to(torch_device)
    -
    -        # set layer drop to 0
    -        model.config.layerdrop = 0.0
    -
    -        input_values = inputs_dict["input_values"]
    -
    -        input_lengths = torch.tensor(
    -            [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device
    -        )
    -        output_lengths = model._get_feat_extract_output_lengths(input_lengths)
    -
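    -        # CTC requires the label sequence to be no longer than the feature sequence, hence the "- 2"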
    -        labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size)
    -        inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"])
    -        inputs_dict["labels"] = labels
    -
    -        outputs = model(**inputs_dict)
    -
    -        output = outputs[0]
    -
    -        # Encoder-/Decoder-only models
    -        hidden_states = outputs.hidden_states[0]
    -        attentions = outputs.attentions[0]
    -
    -        hidden_states.retain_grad()
    -        attentions.retain_grad()
    -
    -        output.flatten()[0].backward(retain_graph=True)
    -
    -        self.assertIsNotNone(hidden_states.grad)
    -        self.assertIsNotNone(attentions.grad)
    -
    -    def test_initialization(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        configs_no_init = _config_zero_init(config)
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            for name, param in model.named_parameters():
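    -                # the parameters listed below come from (roughly) uniform initializations, so their mean
    -                # should land in [-1, 1]; with `_config_zero_init`, every other mean should round to 0.0 or 1.0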
    -                uniform_init_parms = [
    -                    "conv.weight",
    -                    "conv.parametrizations.weight",
    -                    "masked_spec_embed",
    -                    "codevectors",
    -                    "quantizer.weight_proj.weight",
    -                    "project_hid.weight",
    -                    "project_hid.bias",
    -                    "project_q.weight",
    -                    "project_q.bias",
    -                    "feature_projection.projection.weight",
    -                    "feature_projection.projection.bias",
    -                    "objective.weight",
    -                ]
    -                if param.requires_grad:
    -                    if any(x in name for x in uniform_init_parms):
    -                        self.assertTrue(
    -                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
    -                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                        )
    -                    else:
    -                        self.assertIn(
    -                            ((param.data.mean() * 1e9).round() / 1e9).item(),
    -                            [0.0, 1.0],
    -                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                        )
    -
    -    # overwrite from test_modeling_common
    -    def _mock_init_weights(self, module):
    -        if hasattr(module, "weight") and module.weight is not None:
    -            module.weight.data.fill_(3)
    -        if hasattr(module, "weight_g") and module.weight_g is not None:
    -            module.weight_g.data.fill_(3)
    -        if hasattr(module, "weight_v") and module.weight_v is not None:
    -            module.weight_v.data.fill_(3)
    -        if hasattr(module, "bias") and module.bias is not None:
    -            module.bias.data.fill_(3)
    -        if hasattr(module, "codevectors") and module.codevectors is not None:
    -            module.codevectors.data.fill_(3)
    -        if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
    -            module.masked_spec_embed.data.fill_(3)
    -
    -    def test_model_for_pretraining(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        model = Wav2Vec2ForPreTraining(config).to(torch_device)
    -
    -        batch_size = inputs_dict["input_values"].shape[0]
    -        feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]))
    -
    -        features_shape = (batch_size, feature_seq_length)
    -
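    -        # build the time-step mask and the negative samples required by the contrastive pretraining loss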
    -        mask_time_indices = _compute_mask_indices(
    -            features_shape,
    -            model.config.mask_time_prob,
    -            model.config.mask_time_length,
    -            min_masks=2,
    -        )
    -        sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices)
    -
    -        mask_time_indices = torch.from_numpy(mask_time_indices).to(torch_device)
    -        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
    -
    -        loss = model(
    -            inputs_dict["input_values"],
    -            attention_mask=inputs_dict["attention_mask"],
    -            mask_time_indices=mask_time_indices,
    -            sampled_negative_indices=sampled_negative_indices,
    -        ).loss
    -
    -        # more losses
    -        mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True
    -
    -        sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices.cpu().numpy())
    -        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
    -        loss_more_masked = model(
    -            inputs_dict["input_values"],
    -            attention_mask=inputs_dict["attention_mask"],
    -            mask_time_indices=mask_time_indices,
    -            sampled_negative_indices=sampled_negative_indices,
    -        ).loss
    -
    -        # loss_more_masked has to be bigger than or equal to loss since more masked inputs have to be predicted
    -        self.assertTrue(loss.detach().item() <= loss_more_masked.detach().item())
    -
    -    def test_mask_feature_prob_ctc(self):
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2
    -        )
    -        model.to(torch_device).train()
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
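    -        # the model is in train mode, so feature masking is applied; only the output shape is checked here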
    -        batch_duration_in_seconds = [1, 3, 2, 6]
    -        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
    -
    -        batch = processor(
    -            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
    -        )
    -
    -        logits = model(
    -            input_values=batch["input_values"].to(torch_device),
    -            attention_mask=batch["attention_mask"].to(torch_device),
    -        ).logits
    -
    -        self.assertEqual(logits.shape, (4, 1498, 32))
    -
    -    def test_mask_time_prob_ctc(self):
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2
    -        )
    -        model.to(torch_device).train()
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        batch_duration_in_seconds = [1, 3, 2, 6]
    -        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
    -
    -        batch = processor(
    -            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
    -        )
    -
    -        logits = model(
    -            input_values=batch["input_values"].to(torch_device),
    -            attention_mask=batch["attention_mask"].to(torch_device),
    -        ).logits
    -
    -        self.assertEqual(logits.shape, (4, 1498, 32))
    -
    -    def test_mask_time_feature_prob_ctc_single_batch(self):
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2",
    -            mask_time_prob=0.2,
    -            mask_feature_prob=0.2,
    -            mask_time_length=2,
    -            mask_feature_length=2,
    -        )
    -        model.to(torch_device).train()
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        batch_duration_in_seconds = [6]
    -        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
    -
    -        batch = processor(
    -            input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt"
    -        )
    -
    -        logits = model(
    -            input_values=batch["input_values"].to(torch_device),
    -            attention_mask=batch["attention_mask"].to(torch_device),
    -        ).logits
    -
    -        self.assertEqual(logits.shape, (1, 1498, 32))
    -
    -    @unittest.skip(reason="Feed forward chunking is not implemented")
    -    def test_feed_forward_chunking(self):
    -        pass
    -
    -    def test_load_and_set_attn_adapter(self):
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        def get_logits(model, input_features):
    -            model = model.to(torch_device)
    -            batch = processor(
    -                input_features,
    -                padding=True,
    -                sampling_rate=processor.feature_extractor.sampling_rate,
    -                return_tensors="pt",
    -            )
    -
    -            with torch.no_grad():
    -                logits = model(
    -                    input_values=batch["input_values"].to(torch_device),
    -                    attention_mask=batch["attention_mask"].to(torch_device),
    -                ).logits
    -            return logits
    -
    -        input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]]
    -
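    -        # passing `target_lang` at load time should give the same logits as calling `load_adapter` afterwards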
    -        model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="it")
    -
    -        logits = get_logits(model, input_features)
    -
    -        model_2 = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter")
    -        model_2.load_adapter("it")
    -
    -        logits_2 = get_logits(model_2, input_features)
    -
    -        self.assertTrue(torch.allclose(logits, logits_2, atol=1e-3))
    -
    -    # test that adapter weights with a mismatched vocab size can still be loaded
    -    def test_load_target_lang_with_mismatched_size(self):
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        def get_logits(model, input_features):
    -            model = model.to(torch_device)
    -            batch = processor(
    -                input_features,
    -                padding=True,
    -                sampling_rate=processor.feature_extractor.sampling_rate,
    -                return_tensors="pt",
    -            )
    -
    -            with torch.no_grad():
    -                logits = model(
    -                    input_values=batch["input_values"].to(torch_device),
    -                    attention_mask=batch["attention_mask"].to(torch_device),
    -                ).logits
    -            return logits
    -
    -        input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]]
    -
    -        model = Wav2Vec2ForCTC.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True
    -        )
    -
    -        logits = get_logits(model, input_features)
    -
    -        model_2 = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter")
    -        model_2.load_adapter("fr")
    -
    -        logits_2 = get_logits(model_2, input_features)
    -
    -        self.assertTrue(torch.allclose(logits, logits_2, atol=1e-3))
    -
    -    def test_load_attn_adapter(self):
    -        processor = Wav2Vec2Processor.from_pretrained(
    -            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
    -        )
    -
    -        def get_logits(model, input_features):
    -            model = model.to(torch_device)
    -            batch = processor(
    -                input_features,
    -                padding=True,
    -                sampling_rate=processor.feature_extractor.sampling_rate,
    -                return_tensors="pt",
    -            )
    -
    -            with torch.no_grad():
    -                logits = model(
    -                    input_values=batch["input_values"].to(torch_device),
    -                    attention_mask=batch["attention_mask"].to(torch_device),
    -                ).logits
    -            return logits
    -
    -        input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]]
    -
    -        model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", adapter_attn_dim=16)
    -
    -        with tempfile.TemporaryDirectory() as tempdir:
    -            model.save_pretrained(tempdir)
    -            model = Wav2Vec2ForCTC.from_pretrained(tempdir)
    -
    -            logits = get_logits(model, input_features)
    -            adapter_weights = model._get_adapters()
    -
    -            # save safe weights
    -            safe_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_SAFE_FILE.format("eng"))
    -            safe_save_file(adapter_weights, safe_filepath, metadata={"format": "pt"})
    -
    -            model.load_adapter("eng")
    -            model.load_adapter("eng", use_safetensors=True)
    -
    -            with self.assertRaises(OSError):
    -                model.load_adapter("eng", use_safetensors=False)
    -            with self.assertRaises(Exception):
    -                model.load_adapter("ita", use_safetensors=True)
    -            logits_2 = get_logits(model, input_features)
    -
    -            self.assertTrue(torch.allclose(logits, logits_2, atol=1e-3))
    -
    -        with tempfile.TemporaryDirectory() as tempdir:
    -            model.save_pretrained(tempdir)
    -            model = Wav2Vec2ForCTC.from_pretrained(tempdir)
    -
    -            logits = get_logits(model, input_features)
    -            adapter_weights = model._get_adapters()
    -
    -            # save pt weights
    -            pt_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_PT_FILE.format("eng"))
    -            torch.save(adapter_weights, pt_filepath)
    -
    -            model.load_adapter("eng")
    -            model.load_adapter("eng", use_safetensors=False)
    -
    -            with self.assertRaises(OSError):
    -                model.load_adapter("eng", use_safetensors=True)
    -
    -            logits_2 = get_logits(model, input_features)
    -
    -            self.assertTrue(torch.allclose(logits, logits_2, atol=1e-3))
    -
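    -        # finally, exercise adapter loading on a checkpoint that already ships adapter weights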
    -        model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter")
    -        logits = get_logits(model, input_features)
    -
    -        model.load_adapter("eng")
    -        model.load_adapter("eng", use_safetensors=False)
    -        model.load_adapter("eng", use_safetensors=True)
    -
    -        logits_2 = get_logits(model, input_features)
    -
    -        self.assertTrue(torch.allclose(logits, logits_2, atol=1e-3))
    -
    -    @slow
    -    def test_model_from_pretrained(self):
    -        model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
    -        self.assertIsNotNone(model)
    -
    -
    -@require_torch
    -class Wav2Vec2UtilsTest(unittest.TestCase):
    -    def test_compute_mask_indices(self):
    -        batch_size = 4
    -        sequence_length = 60
    -        mask_prob = 0.5
    -        mask_length = 1
    -
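    -        # with mask_length=1, spans cannot overlap, so each row must contain exactly mask_prob * sequence_length masked positions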
    -        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
    -        mask = torch.from_numpy(mask).to(torch_device)
    -
    -        self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
    -
    -    def test_compute_mask_indices_low_prob(self):
    -        # with these settings num_masked_spans=0.5, which means probabilistic rounding
    -        # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
    -        # the other 5 out of 10 cases, num_masked_spans=1
    -        n_trials = 100
    -        batch_size = 4
    -        sequence_length = 100
    -        mask_prob = 0.05
    -        mask_length = 10
    -
    -        count_dimensions_masked = 0
    -        count_dimensions_not_masked = 0
    -
    -        for _ in range(n_trials):
    -            mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
    -            mask = torch.from_numpy(mask).to(torch_device)
    -
    -            num_masks = torch.sum(mask).item()
    -
    -            if num_masks > 0:
    -                count_dimensions_masked += 1
    -            else:
    -                count_dimensions_not_masked += 1
    -
    -        # as we test for at least 10 masked dimensions and at least
    -        # 10 non-masked dimensions, this test could fail with probability:
    -        # P(100 coin flips, at most 9 heads) = 1.66e-18
    -        self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
    -        self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
    -
    -    def test_compute_mask_indices_overlap(self):
    -        batch_size = 4
    -        sequence_length = 80
    -        mask_prob = 0.5
    -        mask_length = 4
    -
    -        mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
    -        mask = torch.from_numpy(mask).to(torch_device)
    -
    -        # because of overlap, the masks don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal
    -        for batch_sum in mask.sum(axis=-1):
    -            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
    -
    -    def test_compute_mask_indices_attn_mask_overlap(self):
    -        batch_size = 4
    -        sequence_length = 80
    -        mask_prob = 0.5
    -        mask_length = 4
    -
    -        attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
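    -        # treat the second half of the first two examples as padding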
    -        attention_mask[:2, sequence_length // 2 :] = 0
    -
    -        mask = _compute_mask_indices(
    -            (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask
    -        )
    -        mask = torch.from_numpy(mask).to(torch_device)
    -
    -        for batch_sum in mask.sum(axis=-1):
    -            self.assertTrue(int(batch_sum) <= mask_prob * sequence_length)
    -
    -        self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0)
    -
    -    def test_compute_mask_indices_short_audio(self):
    -        batch_size = 4
    -        sequence_length = 100
    -        mask_prob = 0.05
    -        mask_length = 10
    -
    -        attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
    -        # force one example to be heavily padded
    -        attention_mask[0, 5:] = 0
    -
    -        mask = _compute_mask_indices(
    -            (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2
    -        )
    -
    -        # the valid (non-padded) part of the short example is shorter than `mask_length`, so it must not be masked at all
    -        self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any())
    -
    -    def test_compute_perplexity(self):
    -        probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100
    -
    -        ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs)
    -        self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3)
    -
    -        # mask out the first of the two batch entries
    -        mask = torch.ones((2,), device=torch_device, dtype=torch.bool)
    -        mask[0] = 0
    -
    -        ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask)
    -        self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3)
    -
    -    def test_sample_negatives(self):
    -        batch_size = 2
    -        sequence_length = 10
    -        hidden_size = 4
    -        num_negatives = 3
    -        sequence = torch.div(
    -            torch.arange(sequence_length * hidden_size, device=torch_device), hidden_size, rounding_mode="floor"
    -        )
    -        features = sequence.view(sequence_length, hidden_size)  # each value in the vector consists of the same value
    -        features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()
    -
    -        # sample negative indices
    -        sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None)
    -        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
    -        negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)]
    -        negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3)
    -        self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))
    -
    -        # make sure no negatively sampled vector is actually a positive one
    -        for negative in negatives:
    -            self.assertTrue(((negative - features) == 0).sum() == 0.0)
    -
    -        # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim
    -        self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1))
    -
    -    def test_sample_negatives_with_mask(self):
    -        batch_size = 2
    -        sequence_length = 10
    -        hidden_size = 4
    -        num_negatives = 3
    -
    -        # second half of last input tensor is padded
    -        mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
    -        mask[-1, sequence_length // 2 :] = 0
    -
    -        sequence = torch.div(
    -            torch.arange(sequence_length * hidden_size, device=torch_device), hidden_size, rounding_mode="floor"
    -        )
    -        features = sequence.view(sequence_length, hidden_size)  # each value in the vector consists of the same value
    -        features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()
    -
    -        # replace masked feature vectors with -100 to test that those are not sampled
    -        features = torch.where(mask[:, :, None].expand(features.shape).bool(), features, -100)
    -
    -        # sample negative indices
    -        sampled_negative_indices = _sample_negative_indices(
    -            (batch_size, sequence_length), num_negatives, mask.cpu().numpy()
    -        )
    -        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
    -        negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)]
    -        negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3)
    -
    -        self.assertTrue((negatives >= 0).all().item())
    -
    -        self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))
    -
    -        # make sure no negatively sampled vector is actually a positive one
    -        for negative in negatives:
    -            self.assertTrue(((negative - features) == 0).sum() == 0.0)
    -
    -        # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim
    -        self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1))
    -
    -
    -@require_torch
    -@require_soundfile
    -@slow
    -class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
    -    def tearDown(self):
    -        super().tearDown()
    -        # clean up as much of the GPU memory occupied by PyTorch as possible
    -        gc.collect()
    -        backend_empty_cache(torch_device)
    -
    -    def _load_datasamples(self, num_samples):
    -        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    -        # automatic decoding with librispeech
    -        speech_samples = ds.sort("id").filter(
    -            lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]
    -        )[:num_samples]["audio"]
    -
    -        return [x["array"] for x in speech_samples]
    -
    -    def _load_superb(self, task, num_samples):
    -        ds = load_dataset("anton-l/superb_dummy", task, split="test")
    -
    -        return ds[:num_samples]
    -
    -    def test_inference_ctc_normal(self):
    -        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    -        model.to(torch_device)
    -        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
    -        input_speech = self._load_datasamples(1)
    -
    -        input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device)
    -
    -        with torch.no_grad():
    -            logits = model(input_values).logits
    -
    -        predicted_ids = torch.argmax(logits, dim=-1)
    -        predicted_trans = processor.batch_decode(predicted_ids)
    -
    -        EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
    -        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
    -
    -    def test_inference_ctc_normal_batched(self):
    -        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    -        model.to(torch_device)
    -        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
    -
    -        input_speech = self._load_datasamples(2)
    -
    -        inputs = processor(input_speech, return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -
    -        with torch.no_grad():
    -            logits = model(input_values).logits
    -
    -        predicted_ids = torch.argmax(logits, dim=-1)
    -        predicted_trans = processor.batch_decode(predicted_ids)
    -
    -        EXPECTED_TRANSCRIPTIONS = [
    -            "a man said to the universe sir i exist",
    -            "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
    -        ]
    -        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
    -
    -    def test_inference_ctc_robust_batched(self):
    -        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(torch_device)
    -        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)
    -
    -        input_speech = self._load_datasamples(4)
    -
    -        inputs = processor(input_speech, return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -
    -        with torch.no_grad():
    -            logits = model(input_values, attention_mask=attention_mask).logits
    -
    -        predicted_ids = torch.argmax(logits, dim=-1)
    -        predicted_trans = processor.batch_decode(predicted_ids)
    -
    -        EXPECTED_TRANSCRIPTIONS = [
    -            "a man said to the universe sir i exist",
    -            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    -            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
    -            " him with the thousands of spectators were trivialities not worth thinking about",
    -            "his instant panic was followed by a small sharp blow high on his chest",
    -        ]
    -        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
    -
    -    @unittest.skipIf(torch_device != "cpu", "cannot make deterministic on GPU")
    -    def test_inference_integration(self):
    -        model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
    -        model.to(torch_device)
    -        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
    -        input_speech = self._load_datasamples(2)
    -
    -        inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)
    -
    -        batch_size = inputs_dict["input_values"].shape[0]
    -        feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]))
    -
    -        features_shape = (batch_size, feature_seq_length)
    -
    -        np.random.seed(4)
    -        mask_time_indices = _compute_mask_indices(
    -            features_shape,
    -            model.config.mask_time_prob,
    -            model.config.mask_time_length,
    -            min_masks=2,
    -        )
    -        mask_time_indices = torch.from_numpy(mask_time_indices).to(torch_device)
    -
    -        with torch.no_grad():
    -            outputs = model(
    -                inputs_dict.input_values.to(torch_device),
    -                mask_time_indices=mask_time_indices,
    -            )
    -
    -        # compute cosine similarity
    -        cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
    -
    -        # retrieve cosine sim of masked features
    -        cosine_sim_masked = cosine_sim[mask_time_indices]
    -
    -        # the cosine similarity of the model is all > 0.5 as the model is
    -        # pre-trained with a contrastive loss
    -        # fmt: off
    -        expected_cosine_sim_masked = torch.tensor([
    -            0.8523, 0.5860, 0.6905, 0.5557, 0.7456, 0.5249, 0.6639, 0.7654, 0.7565,
    -            0.8167, 0.8222, 0.7960, 0.8034, 0.8166, 0.8310, 0.8263, 0.8274, 0.8258,
    -            0.8179, 0.8412, 0.8536, 0.5098, 0.4728, 0.6461, 0.4498, 0.6002, 0.5774,
    -            0.6457, 0.7123, 0.5668, 0.6866, 0.4960, 0.6293, 0.7423, 0.7419, 0.7526,
    -            0.7768, 0.4898, 0.5393, 0.8183
    -        ], device=torch_device)
    -        # fmt: on
    -
    -        self.assertTrue(torch.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3))
    -
    -    def test_inference_pretrained(self):
    -        model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
    -        model.to(torch_device)
    -        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    -            "facebook/wav2vec2-base", return_attention_mask=True
    -        )
    -        input_speech = self._load_datasamples(2)
    -
    -        inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)
    -
    -        batch_size = inputs_dict["input_values"].shape[0]
    -        feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]))
    -
    -        features_shape = (batch_size, feature_seq_length)
    -
    -        torch.manual_seed(0)
    -        mask_time_indices = _compute_mask_indices(
    -            features_shape,
    -            model.config.mask_time_prob,
    -            model.config.mask_time_length,
    -            min_masks=2,
    -        )
    -        mask_time_indices = torch.from_numpy(mask_time_indices).to(torch_device)
    -
    -        with torch.no_grad():
    -            outputs = model(
    -                inputs_dict.input_values.to(torch_device),
    -                attention_mask=inputs_dict.attention_mask.to(torch_device),
    -                mask_time_indices=mask_time_indices,
    -            )
    -
    -        # compute cosine similarity
    -        cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
    -
    -        # retrieve cosine sim of masked features
    -        cosine_sim_masked = cosine_sim[mask_time_indices]
    -
    -        # ... now compare to randomly initialized model
    -
    -        config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base")
    -        model_rand = Wav2Vec2ForPreTraining(config).to(torch_device).eval()
    -
    -        with torch.no_grad():
    -            outputs_rand = model_rand(
    -                inputs_dict.input_values.to(torch_device),
    -                attention_mask=inputs_dict.attention_mask.to(torch_device),
    -                mask_time_indices=mask_time_indices,
    -            )
    -
    -        # compute cosine similarity
    -        cosine_sim_rand = torch.cosine_similarity(
    -            outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1
    -        )
    -
    -        # retrieve cosine sim of masked features
    -        cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices]
    -
    -        # a pretrained wav2vec2 model has learned to predict the quantized latent states
    -        # => the cosine similarity between quantized states and predicted states > 0.5
    -        # a random wav2vec2 model has not learned to predict the quantized latent states
    -        # => the cosine similarity between quantized states and predicted states is very likely < 0.1
    -        self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0)
    -
    -    @unittest.skipIf(torch_device != "cpu", "cannot make deterministic on GPU")
    -    def test_loss_pretraining(self):
    -        model = Wav2Vec2ForPreTraining.from_pretrained(
    -            "facebook/wav2vec2-base",
    -            attention_dropout=0.0,
    -            feat_proj_dropout=0.0,
    -            hidden_dropout=0.0,
    -            layerdrop=0.0,
    -        )
    -        model.to(torch_device).train()
    -
    -        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    -            "facebook/wav2vec2-base", return_attention_mask=True
    -        )
    -        input_speech = self._load_datasamples(2)
    -
    -        inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)
    -
    -        batch_size = inputs_dict["input_values"].shape[0]
    -        feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]))
    -
    -        features_shape = (batch_size, feature_seq_length)
    -
    -        torch.manual_seed(0)
    -        np.random.seed(0)
    -
    -        mask_time_indices = _compute_mask_indices(
    -            features_shape,
    -            model.config.mask_time_prob,
    -            model.config.mask_time_length,
    -            min_masks=2,
    -        )
    -        sampled_negative_indices = _sample_negative_indices(
    -            mask_time_indices.shape, model.config.num_negatives, mask_time_indices
    -        )
    -
    -        mask_time_indices = torch.from_numpy(mask_time_indices).to(torch_device)
    -        sampled_negative_indices = torch.from_numpy(sampled_negative_indices).to(torch_device)
    -
    -        with torch.no_grad():
    -            outputs = model(
    -                inputs_dict.input_values.to(torch_device),
    -                attention_mask=inputs_dict.attention_mask.to(torch_device),
    -                mask_time_indices=mask_time_indices,
    -                sampled_negative_indices=sampled_negative_indices,
    -            )
    -
    -        # check diversity loss
    -        num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
    -        diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
    -        self.assertTrue(abs(diversity_loss.item() - 0.9538) < 1e-3)
    -
    -        # check overall loss (contrastive loss + diversity loss)
    -        expected_loss = 116.7094
    -
    -        self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3)
    -
    -    def test_inference_keyword_spotting(self):
    -        model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
    -        input_data = self._load_superb("ks", 4)
    -        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -        with torch.no_grad():
    -            outputs = model(input_values, attention_mask=attention_mask)
    -        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)
    -
    -        expected_labels = [7, 6, 10, 9]
    -        # s3prl logits for the same batch
    -        expected_logits = torch.tensor([6.1186, 11.8961, 10.2931, 6.0898], device=torch_device)
    -
    -        self.assertListEqual(predicted_ids.tolist(), expected_labels)
    -        self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2))
    -
    -    def test_inference_intent_classification(self):
    -        model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic")
    -        input_data = self._load_superb("ic", 4)
    -        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -        with torch.no_grad():
    -            outputs = model(input_values, attention_mask=attention_mask)
    -
    -        predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
    -        predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
    -        predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)
    -
    -        expected_labels_action = [0, 0, 2, 3]
    -        expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device)
    -        expected_labels_object = [3, 10, 3, 4]
    -        expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device)
    -        expected_labels_location = [0, 0, 0, 1]
    -        expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device)
    -
    -        self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
    -        self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
    -        self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)
    -
    -        self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2))
    -        self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2))
    -        self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2))
    -
    -    def test_inference_speaker_identification(self):
    -        model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid")
    -        input_data = self._load_superb("si", 4)
    -
    -        output_logits = []
    -        with torch.no_grad():
    -            for example in input_data["speech"]:
    -                input = processor(example, return_tensors="pt", padding=True)
    -                output = model(input.input_values.to(torch_device), attention_mask=None)
    -                output_logits.append(output.logits[0])
    -        output_logits = torch.stack(output_logits)
    -        predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)
    -
    -        expected_labels = [251, 1, 1, 3]
    -        # s3prl logits for the same batch
    -        expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778], device=torch_device)
    -
    -        self.assertListEqual(predicted_ids.tolist(), expected_labels)
    -        self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2))
    -
    -    def test_inference_emotion_recognition(self):
    -        model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    -        input_data = self._load_superb("er", 4)
    -        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -        with torch.no_grad():
    -            outputs = model(input_values, attention_mask=attention_mask)
    -        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)
    -
    -        expected_labels = [1, 1, 2, 2]
    -        # s3prl logits for the same batch
    -        expected_logits = torch.tensor([2.1722, 3.0779, 8.0287, 6.6797], device=torch_device)
    -
    -        self.assertListEqual(predicted_ids.tolist(), expected_labels)
    -        self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-2))
    -
    -    def test_phoneme_recognition(self):
    -        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft").to(torch_device)
    -        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
    -
    -        input_speech = self._load_datasamples(4)
    -
    -        inputs = processor(input_speech, return_tensors="pt", padding=True)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -
    -        with torch.no_grad():
    -            logits = model(input_values, attention_mask=attention_mask).logits
    -
    -        predicted_ids = torch.argmax(logits, dim=-1)
    -        predicted_trans = processor.batch_decode(predicted_ids)
    -
    -        EXPECTED_TRANSCRIPTIONS = [
    -            "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t",
    -            "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ"
    -            " n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ",
    -            "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː"
    -            " v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ"
    -            " ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t",
    -            "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t",
    -        ]
    -        # should correspond to:
    -        # [
    -        # "a man said to the universe sir i exist",
    -        # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    -        # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
    -        # "his instant panic was followed by a small sharp blow high on his chest",
    -        # ]
    -        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
    -
    -    @require_pyctcdecode
    -    @require_torchaudio
    -    def test_wav2vec2_with_lm(self):
    -        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
    -        sample = next(iter(ds))
    -
    -        resampled_audio = torchaudio.functional.resample(
    -            torch.tensor(sample["audio"]["array"]), 48_000, 16_000
    -        ).numpy()
    -
    -        model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm").to(
    -            torch_device
    -        )
    -        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
    -
    -        input_values = processor(resampled_audio, return_tensors="pt").input_values
    -
    -        with torch.no_grad():
    -            logits = model(input_values.to(torch_device)).logits
    -
    -        transcription = processor.batch_decode(logits.cpu().numpy()).text
    -
    -        self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas")
    -
    -    @require_pyctcdecode
    -    @require_torchaudio
    -    def test_wav2vec2_with_lm_pool(self):
    -        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
    -        sample = next(iter(ds))
    -
    -        resampled_audio = torchaudio.functional.resample(
    -            torch.tensor(sample["audio"]["array"]), 48_000, 16_000
    -        ).numpy()
    -
    -        model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm").to(
    -            torch_device
    -        )
    -        processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm")
    -
    -        input_values = processor(resampled_audio, return_tensors="pt").input_values
    -
    -        with torch.no_grad():
    -            logits = model(input_values.to(torch_device)).logits
    -
    -        # test user-managed pool
    -        with multiprocessing.get_context("fork").Pool(2) as pool:
    -            transcription = processor.batch_decode(logits.cpu().numpy(), pool).text
    -
    -        self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas")
    -
    -        # user-managed pool + num_processes should trigger a warning
    -        with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool(
    -            2
    -        ) as pool:
    -            transcription = processor.batch_decode(logits.cpu().numpy(), pool, num_processes=2).text
    -
    -        self.assertIn("num_process", cl.out)
    -        self.assertIn("it will be ignored", cl.out)
    -
    -        self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas")
    -
    -    @require_pyctcdecode
    -    @require_torchaudio
    -    def test_wav2vec2_with_lm_invalid_pool(self):
    -        run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None)
    -
    -    def test_inference_diarization(self):
    -        model = Wav2Vec2ForAudioFrameClassification.from_pretrained("anton-l/wav2vec2-base-superb-sd").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sd")
    -        input_data = self._load_superb("sd", 4)
    -        inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000)
    -
    -        input_values = inputs.input_values.to(torch_device)
    -        attention_mask = inputs.attention_mask.to(torch_device)
    -        with torch.no_grad():
    -            outputs = model(input_values, attention_mask=attention_mask)
    -        # labels is a binary array of shape (batch_size, num_frames, num_speakers)
    -        labels = (outputs.logits > 0).long()
    -
    -        # s3prl logits for the same batch
    -        expected_logits = torch.tensor(
    -            [
    -                [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]],
    -                [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]],
    -                [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]],
    -                [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]],
    -            ],
    -            device=torch_device,
    -        )
    -        self.assertEqual(labels[0, :, 0].sum(), 555)
    -        self.assertEqual(labels[0, :, 1].sum(), 299)
    -        self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2))
    -
    -    def test_inference_speaker_verification(self):
    -        model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv").to(torch_device)
    -        processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sv")
    -        input_data = self._load_superb("si", 4)
    -
    -        inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000)
    -        labels = torch.tensor([5, 1, 1, 3], device=torch_device).T
    -
    -        with torch.no_grad():
    -            input_values = inputs.input_values.to(torch_device)
    -            attention_mask = inputs.attention_mask.to(torch_device)
    -            outputs = model(input_values, attention_mask=attention_mask, labels=labels)
    -        embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu()
    -
    -        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
    -        # id10002 vs id10002
    -        self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3)
    -        # id10006 vs id10002
    -        self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3)
    -        # id10002 vs id10004
    -        self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3)
    -
    -        self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2)
    -
    -    @require_torchaudio
    -    def test_inference_mms_1b_all(self):
    -        model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all").to(torch_device)
    -        processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all")
    -
    -        LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}
    -
    -        def run_model(lang):
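    -            # Stream a single Common Voice sample, switch the MMS adapter and the tokenizer's
    -            # target language, then transcribe with greedy CTC decoding.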
    -            ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
    -            sample = next(iter(ds))
    -
    -            wav2vec2_lang = LANG_MAP[lang]
    -
    -            model.load_adapter(wav2vec2_lang)
    -            processor.tokenizer.set_target_lang(wav2vec2_lang)
    -
    -            resampled_audio = torchaudio.functional.resample(
    -                torch.tensor(sample["audio"]["array"]), 48_000, 16_000
    -            ).numpy()
    -
    -            inputs = processor(resampled_audio, sampling_rate=16_000, return_tensors="pt")
    -            input_values = inputs.input_values.to(torch_device)
    -            attention_mask = inputs.attention_mask.to(torch_device)
    -
    -            with torch.no_grad():
    -                outputs = model(input_values, attention_mask=attention_mask).logits
    -
    -            ids = torch.argmax(outputs, dim=-1)[0]
    -
    -            transcription = processor.decode(ids)
    -            return transcription
    -
    -        TRANSCRIPTIONS = {
    -            "it": "il libro ha suscitato molte polemiche a causa dei suoi contenuti",
    -            "es": "habitan aguas poco profundas y rocosas",
    -            "fr": "ce dernier est volé tout au long de l'histoire romaine",
    -            "en": "joe keton disapproved of films and buster also had reservations about the media",
    -        }
    -
    -        for lang in LANG_MAP.keys():
    -            assert run_model(lang) == TRANSCRIPTIONS[lang]
    diff --git a/tests/transformers/tests/test_configuration_common.py b/tests/transformers/tests/test_configuration_common.py
    deleted file mode 100644
    index 5fb93f71eb..0000000000
    --- a/tests/transformers/tests/test_configuration_common.py
    +++ /dev/null
    @@ -1,155 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import copy
    -import json
    -import os
    -import tempfile
    -
    -from transformers import is_torch_available
    -
    -from .test_configuration_utils import config_common_kwargs
    -
    -
    -class ConfigTester(object):
    -    def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs):
    -        self.parent = parent
    -        self.config_class = config_class
    -        self.has_text_modality = has_text_modality
    -        self.inputs_dict = kwargs
    -        self.common_properties = common_properties
    -
    -    def create_and_test_config_common_properties(self):
    -        config = self.config_class(**self.inputs_dict)
    -        common_properties = (
    -            ["hidden_size", "num_attention_heads", "num_hidden_layers"]
    -            if self.common_properties is None
    -            else self.common_properties
    -        )
    -
    -        # Add common fields for text models
    -        if self.has_text_modality:
    -            common_properties.extend(["vocab_size"])
    -
    -        # Test that config has the common properties as getters
    -        for prop in common_properties:
    -            self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist")
    -
    -        # Test that config has the common properties as setter
    -        for idx, name in enumerate(common_properties):
    -            try:
    -                setattr(config, name, idx)
    -                self.parent.assertEqual(
    -                    getattr(config, name), idx, msg=f"`{name}` value {idx} expected, but was {getattr(config, name)}"
    -                )
    -            except NotImplementedError:
    -                # Some models might not be able to implement setters for common_properties
    -                # In that case, a NotImplementedError is raised
    -                pass
    -
    -        # Test if config class can be called with Config(prop_name=..)
    -        for idx, name in enumerate(common_properties):
    -            try:
    -                config = self.config_class(**{name: idx})
    -                self.parent.assertEqual(
    -                    getattr(config, name), idx, msg=f"`{name}` value {idx} expected, but was {getattr(config, name)}"
    -                )
    -            except NotImplementedError:
    -                # Some models might not be able to implement setters for common_properties
    -                # In that case, a NotImplementedError is raised
    -                pass
    -
    -    def create_and_test_config_to_json_string(self):
    -        config = self.config_class(**self.inputs_dict)
    -        obj = json.loads(config.to_json_string())
    -        for key, value in self.inputs_dict.items():
    -            self.parent.assertEqual(obj[key], value)
    -
    -    def create_and_test_config_to_json_file(self):
    -        config_first = self.config_class(**self.inputs_dict)
    -
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            json_file_path = os.path.join(tmpdirname, "config.json")
    -            config_first.to_json_file(json_file_path)
    -            config_second = self.config_class.from_json_file(json_file_path)
    -
    -        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
    -
    -    def create_and_test_config_from_and_save_pretrained(self):
    -        config_first = self.config_class(**self.inputs_dict)
    -
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            config_first.save_pretrained(tmpdirname)
    -            config_second = self.config_class.from_pretrained(tmpdirname)
    -
    -        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
    -
    -    def create_and_test_config_from_and_save_pretrained_subfolder(self):
    -        config_first = self.config_class(**self.inputs_dict)
    -
    -        subfolder = "test"
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            sub_tmpdirname = os.path.join(tmpdirname, subfolder)
    -            config_first.save_pretrained(sub_tmpdirname)
    -            config_second = self.config_class.from_pretrained(tmpdirname, subfolder=subfolder)
    -
    -        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
    -
    -    def create_and_test_config_with_num_labels(self):
    -        config = self.config_class(**self.inputs_dict, num_labels=5)
    -        self.parent.assertEqual(len(config.id2label), 5)
    -        self.parent.assertEqual(len(config.label2id), 5)
    -
    -        config.num_labels = 3
    -        self.parent.assertEqual(len(config.id2label), 3)
    -        self.parent.assertEqual(len(config.label2id), 3)
    -
    -    def check_config_can_be_init_without_params(self):
    -        if self.config_class.is_composition:
    -            with self.parent.assertRaises(ValueError):
    -                config = self.config_class()
    -        else:
    -            config = self.config_class()
    -            self.parent.assertIsNotNone(config)
    -
    -    def check_config_arguments_init(self):
    -        kwargs = copy.deepcopy(config_common_kwargs)
    -        config = self.config_class(**kwargs)
    -        wrong_values = []
    -        for key, value in config_common_kwargs.items():
    -            if key == "torch_dtype":
    -                if not is_torch_available():
    -                    continue
    -                else:
    -                    import torch
    -
    -                    if config.torch_dtype != torch.float16:
    -                        wrong_values.append(("torch_dtype", config.torch_dtype, torch.float16))
    -            elif getattr(config, key) != value:
    -                wrong_values.append((key, getattr(config, key), value))
    -
    -        if len(wrong_values) > 0:
    -            errors = "\n".join([f"- {v[0]}: got {v[1]} instead of {v[2]}" for v in wrong_values])
    -            raise ValueError(f"The following keys were not properly set in the config:\n{errors}")
    -
    -    def run_common_tests(self):
    -        self.create_and_test_config_common_properties()
    -        self.create_and_test_config_to_json_string()
    -        self.create_and_test_config_to_json_file()
    -        self.create_and_test_config_from_and_save_pretrained()
    -        self.create_and_test_config_from_and_save_pretrained_subfolder()
    -        self.create_and_test_config_with_num_labels()
    -        self.check_config_can_be_init_without_params()
    -        self.check_config_arguments_init()
    diff --git a/tests/transformers/tests/test_configuration_utils.py b/tests/transformers/tests/test_configuration_utils.py
    deleted file mode 100644
    index b7323c5905..0000000000
    --- a/tests/transformers/tests/test_configuration_utils.py
    +++ /dev/null
    @@ -1,292 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import json
    -import os
    -import shutil
    -import sys
    -import tempfile
    -import unittest
    -import unittest.mock as mock
    -from pathlib import Path
    -
    -from huggingface_hub import HfFolder, delete_repo
    -from requests.exceptions import HTTPError
    -from transformers import AutoConfig, BertConfig, GPT2Config
    -from transformers.configuration_utils import PretrainedConfig
    -from transformers.testing_utils import TOKEN, USER, is_staging_test
    -
    -
    -sys.path.append(str(Path(__file__).parent.parent / "utils"))
    -
    -from test_module.custom_configuration import CustomConfig  # noqa E402
    -
    -
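    -# Non-default values for the common `PretrainedConfig` arguments; `ConfigTester.check_config_arguments_init`
    -# uses them to check that each argument is correctly propagated to a config instance.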
    -config_common_kwargs = {
    -    "return_dict": False,
    -    "output_hidden_states": True,
    -    "output_attentions": True,
    -    "torchscript": True,
    -    "torch_dtype": "float16",
    -    "use_bfloat16": True,
    -    "tf_legacy_loss": True,
    -    "pruned_heads": {"a": 1},
    -    "tie_word_embeddings": False,
    -    "is_decoder": True,
    -    "cross_attention_hidden_size": 128,
    -    "add_cross_attention": True,
    -    "tie_encoder_decoder": True,
    -    "max_length": 50,
    -    "min_length": 3,
    -    "do_sample": True,
    -    "early_stopping": True,
    -    "num_beams": 3,
    -    "num_beam_groups": 3,
    -    "diversity_penalty": 0.5,
    -    "temperature": 2.0,
    -    "top_k": 10,
    -    "top_p": 0.7,
    -    "typical_p": 0.2,
    -    "repetition_penalty": 0.8,
    -    "length_penalty": 0.8,
    -    "no_repeat_ngram_size": 5,
    -    "encoder_no_repeat_ngram_size": 5,
    -    "bad_words_ids": [1, 2, 3],
    -    "num_return_sequences": 3,
    -    "chunk_size_feed_forward": 5,
    -    "output_scores": True,
    -    "return_dict_in_generate": True,
    -    "forced_bos_token_id": 2,
    -    "forced_eos_token_id": 3,
    -    "remove_invalid_values": True,
    -    "architectures": ["BertModel"],
    -    "finetuning_task": "translation",
    -    "id2label": {0: "label"},
    -    "label2id": {"label": "0"},
    -    "tokenizer_class": "BertTokenizerFast",
    -    "prefix": "prefix",
    -    "bos_token_id": 6,
    -    "pad_token_id": 7,
    -    "eos_token_id": 8,
    -    "sep_token_id": 9,
    -    "decoder_start_token_id": 10,
    -    "exponential_decay_length_penalty": (5, 1.01),
    -    "suppress_tokens": [0, 1],
    -    "begin_suppress_tokens": 2,
    -    "task_specific_params": {"translation": "some_params"},
    -    "problem_type": "regression",
    -}
    -
    -
    -@is_staging_test
    -class ConfigPushToHubTester(unittest.TestCase):
    -    @classmethod
    -    def setUpClass(cls):
    -        cls._token = TOKEN
    -        HfFolder.save_token(TOKEN)
    -
    -    @classmethod
    -    def tearDownClass(cls):
    -        try:
    -            delete_repo(token=cls._token, repo_id="test-config")
    -        except HTTPError:
    -            pass
    -
    -        try:
    -            delete_repo(token=cls._token, repo_id="valid_org/test-config-org")
    -        except HTTPError:
    -            pass
    -
    -        try:
    -            delete_repo(token=cls._token, repo_id="test-dynamic-config")
    -        except HTTPError:
    -            pass
    -
    -    def test_push_to_hub(self):
    -        config = BertConfig(
    -            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
    -        )
    -        config.push_to_hub("test-config", token=self._token)
    -
    -        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
    -        for k, v in config.to_dict().items():
    -            if k != "transformers_version":
    -                self.assertEqual(v, getattr(new_config, k))
    -
    -        # Reset repo
    -        delete_repo(token=self._token, repo_id="test-config")
    -
    -        # Push to hub via save_pretrained
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            config.save_pretrained(tmp_dir, repo_id="test-config", push_to_hub=True, token=self._token)
    -
    -        new_config = BertConfig.from_pretrained(f"{USER}/test-config")
    -        for k, v in config.to_dict().items():
    -            if k != "transformers_version":
    -                self.assertEqual(v, getattr(new_config, k))
    -
    -    def test_push_to_hub_in_organization(self):
    -        config = BertConfig(
    -            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
    -        )
    -        config.push_to_hub("valid_org/test-config-org", use_auth_token=self._token)
    -
    -        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
    -        for k, v in config.to_dict().items():
    -            if k != "transformers_version":
    -                self.assertEqual(v, getattr(new_config, k))
    -
    -        # Reset repo
    -        delete_repo(token=self._token, repo_id="valid_org/test-config-org")
    -
    -        # Push to hub via save_pretrained
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            config.save_pretrained(
    -                tmp_dir, repo_id="valid_org/test-config-org", push_to_hub=True, use_auth_token=self._token
    -            )
    -
    -        new_config = BertConfig.from_pretrained("valid_org/test-config-org")
    -        for k, v in config.to_dict().items():
    -            if k != "transformers_version":
    -                self.assertEqual(v, getattr(new_config, k))
    -
    -    def test_push_to_hub_dynamic_config(self):
    -        CustomConfig.register_for_auto_class()
    -        config = CustomConfig(attribute=42)
    -
    -        config.push_to_hub("test-dynamic-config", use_auth_token=self._token)
    -
    -        # This has added the proper auto_map field to the config
    -        self.assertDictEqual(config.auto_map, {"AutoConfig": "custom_configuration.CustomConfig"})
    -
    -        new_config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-config", trust_remote_code=True)
    -        # Can't make an isinstance check because the new_config is from the CustomConfig class of a dynamic module
    -        self.assertEqual(new_config.__class__.__name__, "CustomConfig")
    -        self.assertEqual(new_config.attribute, 42)
    -
    -
    -class ConfigTestUtils(unittest.TestCase):
    -    def test_config_from_string(self):
    -        c = GPT2Config()
    -
    -        # attempt to modify each of int/float/bool/str config records and verify they were updated
    -        n_embd = c.n_embd + 1  # int
    -        resid_pdrop = c.resid_pdrop + 1.0  # float
    -        scale_attn_weights = not c.scale_attn_weights  # bool
    -        summary_type = c.summary_type + "foo"  # str
    -        c.update_from_string(
    -            f"n_embd={n_embd},resid_pdrop={resid_pdrop},scale_attn_weights={scale_attn_weights},summary_type={summary_type}"
    -        )
    -        self.assertEqual(n_embd, c.n_embd, "mismatch for key: n_embd")
    -        self.assertEqual(resid_pdrop, c.resid_pdrop, "mismatch for key: resid_pdrop")
    -        self.assertEqual(scale_attn_weights, c.scale_attn_weights, "mismatch for key: scale_attn_weights")
    -        self.assertEqual(summary_type, c.summary_type, "mismatch for key: summary_type")
    -
    -    def test_config_common_kwargs_is_complete(self):
    -        base_config = PretrainedConfig()
    -        missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs]
    -        # If this part of the test fails, you have arguments to add in config_common_kwargs above.
    -        self.assertListEqual(
    -            missing_keys, ["is_encoder_decoder", "_name_or_path", "_commit_hash", "transformers_version"]
    -        )
    -        keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)]
    -        if len(keys_with_defaults) > 0:
    -            raise ValueError(
    -                "The following keys are set with the default values in"
    -                " `test_configuration_common.config_common_kwargs` pick another value for them:"
    -                f" {', '.join(keys_with_defaults)}."
    -            )
    -
    -    def test_nested_config_load_from_dict(self):
    -        config = AutoConfig.from_pretrained(
    -            "hf-internal-testing/tiny-random-CLIPModel", text_config={"num_hidden_layers": 2}
    -        )
    -        self.assertNotIsInstance(config.text_config, dict)
    -        self.assertEqual(config.text_config.__class__.__name__, "CLIPTextConfig")
    -
    -    def test_from_pretrained_subfolder(self):
    -        with self.assertRaises(OSError):
    -            # config is in subfolder, the following should not work without specifying the subfolder
    -            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder")
    -
    -        config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder", subfolder="bert")
    -
    -        self.assertIsNotNone(config)
    -
    -    def test_cached_files_are_used_when_internet_is_down(self):
    -        # A mock response for an HTTP head request to emulate server down
    -        response_mock = mock.Mock()
    -        response_mock.status_code = 500
    -        response_mock.headers = {}
    -        response_mock.raise_for_status.side_effect = HTTPError
    -        response_mock.json.return_value = {}
    -
    -        # Download this model to make sure it's in the cache.
    -        _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
    -
    -        # Under the mock environment we get a 500 error when trying to reach the model.
    -        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
    -            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
    -            # This checks that the fake head request was indeed called
    -            mock_head.assert_called()
    -
    -    def test_legacy_load_from_url(self):
    -        # This test is for deprecated behavior and can be removed in v5
    -        _ = BertConfig.from_pretrained(
    -            "https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/config.json"
    -        )
    -
    -    def test_local_versioning(self):
    -        configuration = AutoConfig.from_pretrained("bert-base-cased")
    -        configuration.configuration_files = ["config.4.0.0.json"]
    -
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            configuration.save_pretrained(tmp_dir)
    -            configuration.hidden_size = 2
    -            json.dump(configuration.to_dict(), open(os.path.join(tmp_dir, "config.4.0.0.json"), "w"))
    -
    -            # This should pick the new configuration file as the version of Transformers is > 4.0.0
    -            new_configuration = AutoConfig.from_pretrained(tmp_dir)
    -            self.assertEqual(new_configuration.hidden_size, 2)
    -
    -            # Will need to be adjusted if we reach v42 and this test is still here.
    -            # Should pick the old configuration file as the version of Transformers is < 42.0.0
    -            configuration.configuration_files = ["config.42.0.0.json"]
    -            configuration.hidden_size = 768
    -            configuration.save_pretrained(tmp_dir)
    -            shutil.move(os.path.join(tmp_dir, "config.4.0.0.json"), os.path.join(tmp_dir, "config.42.0.0.json"))
    -            new_configuration = AutoConfig.from_pretrained(tmp_dir)
    -            self.assertEqual(new_configuration.hidden_size, 768)
    -
    -    def test_repo_versioning_before(self):
    -        # This repo has two configuration files, one for v4.0.0 and above with a different hidden size.
    -        repo = "hf-internal-testing/test-two-configs"
    -
    -        import transformers as new_transformers
    -
    -        new_transformers.configuration_utils.__version__ = "v4.0.0"
    -        new_configuration, kwargs = new_transformers.models.auto.AutoConfig.from_pretrained(
    -            repo, return_unused_kwargs=True
    -        )
    -        self.assertEqual(new_configuration.hidden_size, 2)
    -        # This checks `_configuration_file` is not kept in the kwargs by mistake.
    -        self.assertDictEqual(kwargs, {})
    -
    -        # Testing an older version by monkey-patching the version in the module where it's used.
    -        import transformers as old_transformers
    -
    -        old_transformers.configuration_utils.__version__ = "v3.0.0"
    -        old_configuration = old_transformers.models.auto.AutoConfig.from_pretrained(repo)
    -        self.assertEqual(old_configuration.hidden_size, 768)
    diff --git a/tests/transformers/tests/test_modeling_common.py b/tests/transformers/tests/test_modeling_common.py
    deleted file mode 100755
    index 900466aaa2..0000000000
    --- a/tests/transformers/tests/test_modeling_common.py
    +++ /dev/null
    @@ -1,2294 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import collections
    -import copy
    -import gc
    -import inspect
    -import os
    -import os.path
    -import pickle
    -import random
    -import re
    -import tempfile
    -import warnings
    -from collections import defaultdict
    -from typing import Dict, List, Tuple
    -
    -import numpy as np
    -from pytest import mark
    -from transformers import (
    -    AutoModel,
    -    AutoModelForSequenceClassification,
    -    PretrainedConfig,
    -    is_torch_available,
    -    logging,
    -)
    -from transformers.models.auto import get_values
    -from transformers.models.auto.modeling_auto import (
    -    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
    -    MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES,
    -    MODEL_FOR_BACKBONE_MAPPING_NAMES,
    -    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES,
    -    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    -    MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
    -    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
    -    MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
    -    MODEL_FOR_MASKED_LM_MAPPING_NAMES,
    -    MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES,
    -    MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES,
    -    MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES,
    -    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
    -    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
    -    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
    -    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
    -    MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
    -    MODEL_MAPPING_NAMES,
    -)
    -from transformers.testing_utils import (
    -    CaptureLogger,
    -    require_accelerate,
    -    require_safetensors,
    -    require_torch,
    -    require_torch_gpu,
    -    require_torch_multi_gpu,
    -)
    -from transformers.utils import (
    -    CONFIG_NAME,
    -    GENERATION_CONFIG_NAME,
    -    WEIGHTS_NAME,
    -    is_accelerate_available,
    -    is_torch_fx_available,
    -)
    -
    -from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
    -
    -
    -if is_accelerate_available():
    -    from accelerate.utils import compute_module_sizes
    -
    -
    -if is_torch_available():
    -    import torch
    -    from safetensors.torch import save_file as safe_save_file
    -    from torch import nn
    -    from transformers import MODEL_MAPPING, AdaptiveEmbedding
    -    from transformers.pytorch_utils import id_tensor_storage
    -
    -
    -if is_torch_fx_available():
    -    from transformers.utils.fx import symbolic_trace
    -
    -torch_device = "hpu"
    -adapt_transformers_to_gaudi()
    -
    -
    -def _config_zero_init(config):
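    -    # Recursively set every *_range, *_std, initializer_factor and layer_scale attribute
    -    # to a tiny value so that weight initialization is effectively deterministic in tests.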
    -    configs_no_init = copy.deepcopy(config)
    -    for key in configs_no_init.__dict__.keys():
    -        if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key:
    -            setattr(configs_no_init, key, 1e-10)
    -        if isinstance(getattr(configs_no_init, key, None), PretrainedConfig):
    -            no_init_subconfig = _config_zero_init(getattr(configs_no_init, key))
    -            setattr(configs_no_init, key, no_init_subconfig)
    -    return configs_no_init
    -
    -
    -def _mock_init_weights(self, module):
    -    for name, param in module.named_parameters(recurse=False):
    -        # Use the first letter of the parameter name to get a value: ord(letter) - 110 maps 'a' to -13 and 'z' to 12
    -        value = ord(name[0].lower()) - 110
    -        param.data.fill_(value)
    -
    -
    -def _mock_all_init_weights(self):
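    -    # Deterministic stand-in for `PreTrainedModel.init_weights`, used together with
    -    # `_mock_init_weights` in the fast-init save/load tests below.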
    -    # Prune heads if needed
    -    if self.config.pruned_heads:
    -        self.prune_heads(self.config.pruned_heads)
    -
    -    import transformers.modeling_utils
    -
    -    if transformers.modeling_utils._init_weights:
    -        for module in self.modules():
    -            module._is_hf_initialized = False
    -        # Initialize weights
    -        self.apply(self._initialize_weights)
    -
    -        # Tie weights should be skipped when not initializing all weights
    -        # since from_pretrained(...) calls tie weights anyway
    -        self.tie_weights()
    -
    -
    -@require_torch
    -class ModelTesterMixin:
    -    model_tester = None
    -    all_model_classes = ()
    -    all_generative_model_classes = ()
    -    fx_compatible = False
    -    test_torchscript = True
    -    test_pruning = True
    -    test_resize_embeddings = True
    -    test_resize_position_embeddings = False
    -    test_head_masking = True
    -    test_mismatched_shapes = True
    -    test_missing_keys = True
    -    test_model_parallel = False
    -    is_encoder_decoder = False
    -    has_attentions = True
    -    model_split_percents = [0.5, 0.7, 0.9]
    -
    -    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
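    -        # Adapt the common inputs to the signature of `model_class`: duplicate inputs for
    -        # multiple-choice models and add dummy labels of the right shape when `return_labels` is True.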
    -        inputs_dict = copy.deepcopy(inputs_dict)
    -        if model_class.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES):
    -            inputs_dict = {
    -                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
    -                if isinstance(v, torch.Tensor) and v.ndim > 1
    -                else v
    -                for k, v in inputs_dict.items()
    -            }
    -        elif model_class.__name__ in get_values(MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES):
    -            inputs_dict.pop("attention_mask")
    -
    -        if return_labels:
    -            if model_class.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES):
    -                inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
    -            elif model_class.__name__ in [
    -                *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES),
    -            ]:
    -                inputs_dict["start_positions"] = torch.zeros(
    -                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
    -                )
    -                inputs_dict["end_positions"] = torch.zeros(
    -                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
    -                )
    -            elif model_class.__name__ in [
    -                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES),
    -            ]:
    -                inputs_dict["labels"] = torch.zeros(
    -                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
    -                )
    -            elif model_class.__name__ in [
    -                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES),
    -            ]:
    -                inputs_dict["labels"] = torch.zeros(
    -                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
    -                )
    -            elif model_class.__name__ in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES):
    -                num_patches = self.model_tester.image_size // self.model_tester.patch_size
    -                inputs_dict["bool_masked_pos"] = torch.zeros(
    -                    (self.model_tester.batch_size, num_patches**2), dtype=torch.long, device=torch_device
    -                )
    -            elif model_class.__name__ in get_values(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES):
    -                batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape
    -                inputs_dict["labels"] = torch.zeros(
    -                    [self.model_tester.batch_size, height, width], device=torch_device
    -                ).long()
    -
    -        return inputs_dict
    -
    -    def test_save_load(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        def check_save_load(out1, out2):
    -            # make sure we don't have nans
    -            out_2 = out2.cpu().numpy()
    -            out_2[np.isnan(out_2)] = 0
    -
    -            out_1 = out1.cpu().numpy()
    -            out_1[np.isnan(out_1)] = 0
    -            max_diff = np.amax(np.abs(out_1 - out_2))
    -            self.assertLessEqual(max_diff, 1e-5)
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -
    -                # the config file (and the generation config file, if it can generate) should be saved
    -                self.assertTrue(os.path.exists(os.path.join(tmpdirname, CONFIG_NAME)))
    -                self.assertEqual(
    -                    model.can_generate(), os.path.exists(os.path.join(tmpdirname, GENERATION_CONFIG_NAME))
    -                )
    -
    -                model = model_class.from_pretrained(tmpdirname)
    -                model.to(torch_device)
    -                with torch.no_grad():
    -                    second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -
    -            if isinstance(first, tuple) and isinstance(second, tuple):
    -                for tensor1, tensor2 in zip(first, second):
    -                    check_save_load(tensor1, tensor2)
    -            else:
    -                check_save_load(first, second)
    -
    -    def test_from_pretrained_no_checkpoint(self):
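    -        # Instantiating from a config and a state_dict (no checkpoint path) should
    -        # reproduce the original parameters exactly.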
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            state_dict = model.state_dict()
    -
    -            new_model = model_class.from_pretrained(
    -                pretrained_model_name_or_path=None, config=config, state_dict=state_dict
    -            )
    -            for p1, p2 in zip(model.parameters(), new_model.parameters()):
    -                self.assertTrue(torch.equal(p1, p2))
    -
    -    def test_save_load_keys_to_ignore_on_save(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None)
    -            if _keys_to_ignore_on_save is None:
    -                continue
    -
    -            # check the keys are in the original state_dict
    -            for k in _keys_to_ignore_on_save:
    -                self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys()))
    -
    -            # check that certain keys didn't get saved with the model
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -                output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME)
    -                state_dict_saved = torch.load(output_model_file)
    -                for k in _keys_to_ignore_on_save:
    -                    self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys()))
    -
    -                # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer.
    -                load_result = model.load_state_dict(state_dict_saved, strict=False)
    -                self.assertTrue(
    -                    len(load_result.missing_keys) == 0
    -                    or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save)
    -                )
    -                self.assertTrue(len(load_result.unexpected_keys) == 0)
    -
    -    def test_gradient_checkpointing_backward_compatibility(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if not model_class.supports_gradient_checkpointing:
    -                continue
    -
    -            config.gradient_checkpointing = True
    -            model = model_class(config)
    -            self.assertTrue(model.is_gradient_checkpointing)
    -
    -    def test_gradient_checkpointing_enable_disable(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if not model_class.supports_gradient_checkpointing:
    -                continue
    -
    -            # at init model should have gradient checkpointing disabled
    -            model = model_class(config)
    -            self.assertFalse(model.is_gradient_checkpointing)
    -
    -            # check enable works
    -            model.gradient_checkpointing_enable()
    -            self.assertTrue(model.is_gradient_checkpointing)
    -
    -            # check disable works
    -            model.gradient_checkpointing_disable()
    -            self.assertFalse(model.is_gradient_checkpointing)
    -
    -    def test_save_load_fast_init_from_base(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        if config.__class__ not in MODEL_MAPPING:
    -            return
    -        base_class = MODEL_MAPPING[config.__class__]
    -
    -        if isinstance(base_class, tuple):
    -            base_class = base_class[0]
    -
    -        for model_class in self.all_model_classes:
    -            if model_class == base_class:
    -                continue
    -
    -            # make a copy of model class to not break future tests
    -            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
    -            class CopyClass(model_class):
    -                pass
    -
    -            model_class_copy = CopyClass
    -
    -            # make sure that all keys are expected for test
    -            model_class_copy._keys_to_ignore_on_load_missing = []
    -
    -            # make init deterministic, but make sure that
    -            # non-initialized weights throw errors nevertheless
    -            model_class_copy._init_weights = _mock_init_weights
    -            model_class_copy.init_weights = _mock_all_init_weights
    -
    -            model = base_class(config)
    -            state_dict = model.state_dict()
    -
    -            # this will often delete a single weight of a multi-weight module
    -            # to test an edge case
    -            random_key_to_del = random.choice(list(state_dict.keys()))
    -            del state_dict[random_key_to_del]
    -
    -            # overwrite the saved weights with the state dict missing one key,
    -            # then check that fast and slow init load the same values
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
    -
    -                model_fast_init = model_class_copy.from_pretrained(tmpdirname)
    -                model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False)
    -                # Before we test anything
    -
    -                for key in model_fast_init.state_dict().keys():
    -                    if isinstance(model_slow_init.state_dict()[key], torch.BoolTensor):
    -                        max_diff = (model_slow_init.state_dict()[key] ^ model_fast_init.state_dict()[key]).sum().item()
    -                    else:
    -                        max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
    -                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
    -
    -    def test_save_load_fast_init_to_base(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        if config.__class__ not in MODEL_MAPPING:
    -            return
    -        base_class = MODEL_MAPPING[config.__class__]
    -
    -        if isinstance(base_class, tuple):
    -            base_class = base_class[0]
    -
    -        for model_class in self.all_model_classes:
    -            if model_class == base_class:
    -                continue
    -
    -            # make a copy of model class to not break future tests
    -            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
    -            class CopyClass(base_class):
    -                pass
    -
    -            base_class_copy = CopyClass
    -
    -            # make sure that all keys are expected for test
    -            base_class_copy._keys_to_ignore_on_load_missing = []
    -
    -            # make init deterministic, but make sure that
    -            # non-initialized weights throw errors nevertheless
    -            base_class_copy._init_weights = _mock_init_weights
    -            base_class_copy.init_weights = _mock_all_init_weights
    -
    -            model = model_class(config)
    -            state_dict = model.state_dict()
    -
    -            # this will often delete a single weight of a multi-weight module
    -            # to test an edge case
    -            random_key_to_del = random.choice(list(state_dict.keys()))
    -            del state_dict[random_key_to_del]
    -
    -            # overwrite the saved weights with the state dict missing one key,
    -            # then check that fast and slow init load the same values
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname)
    -                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
    -
    -                model_fast_init = base_class_copy.from_pretrained(tmpdirname)
    -                model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False)
    -
    -                for key in model_fast_init.state_dict().keys():
    -                    if isinstance(model_slow_init.state_dict()[key], torch.BoolTensor):
    -                        max_diff = torch.max(
    -                            model_slow_init.state_dict()[key] ^ model_fast_init.state_dict()[key]
    -                        ).item()
    -                    else:
    -                        max_diff = torch.max(
    -                            torch.abs(model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key])
    -                        ).item()
    -                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
    -
    -    def test_initialization(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        configs_no_init = _config_zero_init(config)
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            for name, param in model.named_parameters():
    -                if param.requires_grad:
    -                    self.assertIn(
    -                        ((param.data.mean() * 1e9).round() / 1e9).item(),
    -                        [0.0, 1.0],
    -                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
    -                    )
    -
    -    def test_determinism(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        def check_determinism(first, second):
    -            out_1 = first.cpu().numpy()
    -            out_2 = second.cpu().numpy()
    -            out_1 = out_1[~np.isnan(out_1)]
    -            out_2 = out_2[~np.isnan(out_2)]
    -            max_diff = np.amax(np.abs(out_1 - out_2))
    -            self.assertLessEqual(max_diff, 1e-5)
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -
    -            if isinstance(first, tuple) and isinstance(second, tuple):
    -                for tensor1, tensor2 in zip(first, second):
    -                    check_determinism(tensor1, tensor2)
    -            else:
    -                check_determinism(first, second)
    -
    -    def test_forward_signature(self):
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            signature = inspect.signature(model.forward)
    -            # signature.parameters is an OrderedDict => so arg_names order is deterministic
    -            arg_names = [*signature.parameters.keys()]
    -
    -            if model.config.is_encoder_decoder:
    -                expected_arg_names = [
    -                    "input_ids",
    -                    "attention_mask",
    -                    "decoder_input_ids",
    -                    "decoder_attention_mask",
    -                ]
    -                expected_arg_names.extend(
    -                    ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
    -                    if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
    -                    else ["encoder_outputs"]
    -                )
    -                self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
    -            else:
    -                expected_arg_names = ["input_ids"]
    -                self.assertListEqual(arg_names[:1], expected_arg_names)
    -
    -    @mark.skip("Skipped for Gaudi : TODO")
    -    def test_training(self):
    -        if not self.model_tester.is_training:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -            config.return_dict = True
    -
    -            if model_class.__name__ in [
    -                *get_values(MODEL_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES),
    -            ]:
    -                continue
    -
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.train()
    -            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            loss = model(**inputs).loss
    -            loss.backward()
    -
    -    @mark.skip("Skipped for Gaudi : TODO")
    -    def test_training_gradient_checkpointing(self):
    -        if not self.model_tester.is_training:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -            config.use_cache = False
    -            config.return_dict = True
    -
    -            if (
    -                model_class.__name__
    -                in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)]
    -                or not model_class.supports_gradient_checkpointing
    -            ):
    -                continue
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.gradient_checkpointing_enable()
    -            model.train()
    -            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            loss = model(**inputs).loss
    -            loss.backward()
    -
    -    def test_attention_outputs(self):
    -        if not self.has_attentions:
    -            self.skipTest(reason="Model does not output attentions")
    -
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.return_dict = True
    -
    -        seq_len = getattr(self.model_tester, "seq_length", None)
    -        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
    -        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
    -        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
    -        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
    -        chunk_length = getattr(self.model_tester, "chunk_length", None)
    -        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
    -            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
    -
    -        for model_class in self.all_model_classes:
    -            inputs_dict["output_attentions"] = True
    -            inputs_dict["output_hidden_states"] = False
    -            config.return_dict = True
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
    -            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
    -
    -            # check that output_attentions also work using config
    -            del inputs_dict["output_attentions"]
    -            config.output_attentions = True
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
    -            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
    -
    -            if chunk_length is not None:
    -                self.assertListEqual(
    -                    list(attentions[0].shape[-4:]),
    -                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
    -                )
    -            else:
    -                self.assertListEqual(
    -                    list(attentions[0].shape[-3:]),
    -                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
    -                )
    -            out_len = len(outputs)
    -
    -            if self.is_encoder_decoder:
    -                correct_outlen = 5
    -
    -                # loss is at first position
    -                if "labels" in inputs_dict:
    -                    correct_outlen += 1  # loss is added to beginning
    -                # Question Answering model returns start_logits and end_logits
    -                if model_class.__name__ in [
    -                    *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES),
    -                    *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES),
    -                ]:
    -                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output
    -                if "past_key_values" in outputs:
    -                    correct_outlen += 1  # past_key_values have been returned
    -
    -                self.assertEqual(out_len, correct_outlen)
    -
    -                # decoder attentions
    -                decoder_attentions = outputs.decoder_attentions
    -                self.assertIsInstance(decoder_attentions, (list, tuple))
    -                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
    -                self.assertListEqual(
    -                    list(decoder_attentions[0].shape[-3:]),
    -                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
    -                )
    -
    -                # cross attentions
    -                cross_attentions = outputs.cross_attentions
    -                self.assertIsInstance(cross_attentions, (list, tuple))
    -                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
    -                self.assertListEqual(
    -                    list(cross_attentions[0].shape[-3:]),
    -                    [
    -                        self.model_tester.num_attention_heads,
    -                        decoder_seq_length,
    -                        encoder_key_length,
    -                    ],
    -                )
    -
    -            # Check attention is always last and order is fine
    -            inputs_dict["output_attentions"] = True
    -            inputs_dict["output_hidden_states"] = True
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            if hasattr(self.model_tester, "num_hidden_states_types"):
    -                added_hidden_states = self.model_tester.num_hidden_states_types
    -            elif self.is_encoder_decoder:
    -                added_hidden_states = 2
    -            else:
    -                added_hidden_states = 1
    -            self.assertEqual(out_len + added_hidden_states, len(outputs))
    -
    -            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
    -
    -            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
    -            if chunk_length is not None:
    -                self.assertListEqual(
    -                    list(self_attentions[0].shape[-4:]),
    -                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
    -                )
    -            else:
    -                self.assertListEqual(
    -                    list(self_attentions[0].shape[-3:]),
    -                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
    -                )
    -
    -    @mark.skip("Segmentation fault is observed")
    -    def test_torchscript_simple(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        self._create_and_check_torchscript(config, inputs_dict)
    -
    -    @mark.skip("Segmentation fault is observed")
    -    def test_torchscript_output_attentions(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_attentions = True
    -        self._create_and_check_torchscript(config, inputs_dict)
    -
    -    @mark.skip("Segmentation fault is observed")
    -    def test_torchscript_output_hidden_state(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_hidden_states = True
    -        self._create_and_check_torchscript(config, inputs_dict)
    -
    -    # This is copied from `torch/testing/_internal/jit_utils.py::clear_class_registry`
    -    def clear_torch_jit_class_registry(self):
    -        torch._C._jit_clear_class_registry()
    -        torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
    -        # torch 1.8 has no `_clear_class_state` in `torch.jit._state`
    -        if hasattr(torch.jit._state, "_clear_class_state"):
    -            torch.jit._state._clear_class_state()
    -
    -    def _create_and_check_torchscript(self, config, inputs_dict):
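-        """Trace each model class with torch.jit.trace, save and reload the traced module,
-        and check that its parameters and non-persistent buffers match the original model."""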
    -        if not self.test_torchscript:
    -            return
    -
    -        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
    -        configs_no_init.torchscript = True
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            model.to(torch_device)
    -            model.eval()
    -            inputs = self._prepare_for_class(inputs_dict, model_class)
    -
    -            main_input_name = model_class.main_input_name
    -
    -            try:
    -                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSMT still requires this hack -> FSMT should probably be refactored similarly to BART afterward
    -                    main_input = inputs[main_input_name]
    -                    attention_mask = inputs["attention_mask"]
    -                    decoder_input_ids = inputs["decoder_input_ids"]
    -                    decoder_attention_mask = inputs["decoder_attention_mask"]
    -                    model(main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
    -                    traced_model = torch.jit.trace(
    -                        model, (main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
    -                    )
    -                elif "bbox" in inputs and "image" in inputs:  # LayoutLMv2 requires additional inputs
    -                    input_ids = inputs["input_ids"]
    -                    bbox = inputs["bbox"]
    -                    image = inputs["image"].tensor
    -                    model(input_ids, bbox, image)
    -                    traced_model = torch.jit.trace(
    -                        model, (input_ids, bbox, image), check_trace=False
    -                    )  # when traced model is checked, an error is produced due to name mangling
    -                else:
    -                    main_input = inputs[main_input_name]
    -                    model(main_input)
    -                    traced_model = torch.jit.trace(model, main_input)
    -            except RuntimeError:
    -                self.fail("Couldn't trace module.")
    -
    -            with tempfile.TemporaryDirectory() as tmp_dir_name:
    -                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
    -
    -                try:
    -                    torch.jit.save(traced_model, pt_file_name)
    -                except Exception:
    -                    self.fail("Couldn't save module.")
    -
    -                try:
    -                    loaded_model = torch.jit.load(pt_file_name)
    -                except Exception:
    -                    self.fail("Couldn't load module.")
    -
    -            model.to(torch_device)
    -            model.eval()
    -
    -            loaded_model.to(torch_device)
    -            loaded_model.eval()
    -
    -            model_state_dict = model.state_dict()
    -            loaded_model_state_dict = loaded_model.state_dict()
    -
    -            non_persistent_buffers = {}
    -            for key in loaded_model_state_dict.keys():
    -                if key not in model_state_dict.keys():
    -                    non_persistent_buffers[key] = loaded_model_state_dict[key]
    -
    -            loaded_model_state_dict = {
    -                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
    -            }
    -
    -            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
    -
    -            model_buffers = list(model.buffers())
    -            for non_persistent_buffer in non_persistent_buffers.values():
    -                found_buffer = False
    -                for i, model_buffer in enumerate(model_buffers):
    -                    if torch.equal(non_persistent_buffer, model_buffer):
    -                        found_buffer = True
    -                        break
    -
    -                self.assertTrue(found_buffer)
    -                model_buffers.pop(i)
    -
    -            models_equal = True
    -            for layer_name, p1 in model_state_dict.items():
    -                if layer_name in loaded_model_state_dict:
    -                    p2 = loaded_model_state_dict[layer_name]
    -                    if p1.data.ne(p2.data).sum() > 0:
    -                        models_equal = False
    -
    -            self.assertTrue(models_equal)
    -
-            # Avoid a memory leak. Without this, each call increases RAM usage by ~20MB.
-            # (Even with this call, there is still a memory leak of ~0.04MB.)
    -            self.clear_torch_jit_class_registry()
    -
    -    def test_torch_fx(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        self._create_and_check_torch_fx_tracing(config, inputs_dict)
    -
    -    def test_torch_fx_output_loss(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True)
    -
    -    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
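-        """Symbolically trace each model class with `symbolic_trace`, check that traced outputs
-        match eager outputs, and verify the traced model survives pickling."""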
    -        if not is_torch_fx_available() or not self.fx_compatible:
    -            return
    -
    -        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
    -        configs_no_init.return_dict = False
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            model.to(torch_device)
    -            model.eval()
    -            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
    -
    -            try:
    -                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSMT still requires this hack -> FSMT should probably be refactored similarly to BART afterward
    -                    labels = inputs.get("labels", None)
    -                    input_names = [
    -                        "attention_mask",
    -                        "decoder_attention_mask",
    -                        "decoder_input_ids",
    -                        "input_features",
    -                        "input_ids",
    -                        "input_values",
    -                    ]
    -                    if labels is not None:
    -                        input_names.append("labels")
    -
    -                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
    -                    input_names = list(filtered_inputs.keys())
    -
    -                    model_output = model(**filtered_inputs)
    -
    -                    traced_model = symbolic_trace(model, input_names)
    -                    traced_output = traced_model(**filtered_inputs)
    -                else:
    -                    input_names = [
    -                        "attention_mask",
    -                        "bbox",
    -                        "input_features",
    -                        "input_ids",
    -                        "input_values",
    -                        "pixel_values",
    -                        "token_type_ids",
    -                        "visual_feats",
    -                        "visual_pos",
    -                    ]
    -
    -                    labels = inputs.get("labels", None)
    -                    start_positions = inputs.get("start_positions", None)
    -                    end_positions = inputs.get("end_positions", None)
    -                    if labels is not None:
    -                        input_names.append("labels")
    -                    if start_positions is not None:
    -                        input_names.append("start_positions")
    -                    if end_positions is not None:
    -                        input_names.append("end_positions")
    -
    -                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
    -                    input_names = list(filtered_inputs.keys())
    -
    -                    if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and (
    -                        not hasattr(model.config, "problem_type") or model.config.problem_type is None
    -                    ):
    -                        model.config.problem_type = "single_label_classification"
    -
    -                    traced_model = symbolic_trace(model, input_names)
    -                    traced_output = traced_model(**filtered_inputs)
    -                    model_output = model(**filtered_inputs)
    -
    -            except Exception as e:
    -                self.fail(f"Couldn't trace module: {e}")
    -
    -            def flatten_output(output):
    -                flatten = []
    -                for x in output:
    -                    if isinstance(x, (tuple, list)):
    -                        flatten += flatten_output(x)
    -                    elif not isinstance(x, torch.Tensor):
    -                        continue
    -                    else:
    -                        flatten.append(x)
    -                return flatten
    -
    -            model_output = flatten_output(model_output)
    -            traced_output = flatten_output(traced_output)
    -            num_outputs = len(model_output)
    -
    -            for i in range(num_outputs):
    -                self.assertTrue(
    -                    torch.allclose(model_output[i], traced_output[i]),
    -                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
    -                )
    -
    -            # Test that the model can be serialized and restored properly
    -            with tempfile.TemporaryDirectory() as tmp_dir_name:
    -                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
    -                try:
    -                    with open(pkl_file_name, "wb") as f:
    -                        pickle.dump(traced_model, f)
    -                    with open(pkl_file_name, "rb") as f:
    -                        loaded = pickle.load(f)
    -                except Exception as e:
    -                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
    -
    -                loaded_output = loaded(**filtered_inputs)
    -                loaded_output = flatten_output(loaded_output)
    -
    -                for i in range(num_outputs):
    -                    self.assertTrue(
    -                        torch.allclose(model_output[i], loaded_output[i]),
    -                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
    -                    )
    -
-            # Avoid a memory leak. Without this, each call increases RAM usage by ~20MB.
-            # (Even with this call, there is still a memory leak of ~0.04MB.)
    -            self.clear_torch_jit_class_registry()
    -
    -    def test_headmasking(self):
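-        """Check that `head_mask` receives gradients and that masked heads yield (near-)zero attention weights."""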
    -        if not self.test_head_masking:
    -            return
    -
    -        global_rng.seed(42)
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        global_rng.seed()
    -
    -        inputs_dict["output_attentions"] = True
    -        config.output_hidden_states = True
    -        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=configs_no_init)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            # Prepare head_mask
-            # Set requires_grad after having prepared the tensor to avoid the error "leaf variable has been moved into the graph interior"
    -            head_mask = torch.ones(
    -                self.model_tester.num_hidden_layers,
    -                self.model_tester.num_attention_heads,
    -                device=torch_device,
    -            )
    -            head_mask[0, 0] = 0
    -            head_mask[-1, :-1] = 0
    -            head_mask.requires_grad_(requires_grad=True)
    -            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
    -            inputs["head_mask"] = head_mask
    -            if model.config.is_encoder_decoder:
    -                signature = inspect.signature(model.forward)
    -                arg_names = [*signature.parameters.keys()]
    -                if "decoder_head_mask" in arg_names:  # necessary diferentiation because of T5 model
    -                    inputs["decoder_head_mask"] = head_mask
    -                if "cross_attn_head_mask" in arg_names:
    -                    inputs["cross_attn_head_mask"] = head_mask
    -            outputs = model(**inputs, return_dict=True)
    -
    -            # Test that we can get a gradient back for importance score computation
    -            output = sum(t.sum() for t in outputs[0])
    -            output = output.sum()
    -            output.backward()
    -            multihead_outputs = head_mask.grad
    -
    -            self.assertIsNotNone(multihead_outputs)
    -            self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
    -
    -            def check_attentions_validity(attentions):
-                # Remove NaN values
    -                for t in attentions:
    -                    self.assertLess(
    -                        torch.sum(torch.isnan(t)), t.numel() / 4
    -                    )  # Check we don't have more than 25% nans (arbitrary)
    -                attentions = [
    -                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
    -                ]  # remove them (the test is less complete)
    -
    -                self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
    -                self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
    -                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
    -                    self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
    -                self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
    -                self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
    -
    -            if model.config.is_encoder_decoder:
    -                check_attentions_validity(outputs.encoder_attentions)
    -                check_attentions_validity(outputs.decoder_attentions)
    -                check_attentions_validity(outputs.cross_attentions)
    -            else:
    -                check_attentions_validity(outputs.attentions)
    -
    -    def test_head_pruning(self):
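-        """Prune attention heads with `prune_heads` and check that the attention tensors shrink accordingly."""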
    -        if not self.test_pruning:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            (
    -                config,
    -                inputs_dict,
    -            ) = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            if "head_mask" in inputs_dict:
    -                del inputs_dict["head_mask"]
    -
    -            inputs_dict["output_attentions"] = True
    -            config.output_hidden_states = False
    -            model = model_class(config=config)
    -            model.to(torch_device)
    -            model.eval()
    -            heads_to_prune = {
    -                0: list(range(1, self.model_tester.num_attention_heads)),
    -                -1: [0],
    -            }
    -            model.prune_heads(heads_to_prune)
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            attentions = outputs[-1]
    -
    -            self.assertEqual(attentions[0].shape[-3], 1)
    -            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
    -            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
    -            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
    -
    -    def test_head_pruning_save_load_from_pretrained(self):
    -        if not self.test_pruning:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            (
    -                config,
    -                inputs_dict,
    -            ) = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            if "head_mask" in inputs_dict:
    -                del inputs_dict["head_mask"]
    -
    -            inputs_dict["output_attentions"] = True
    -            config.output_hidden_states = False
    -            model = model_class(config=config)
    -            model.to(torch_device)
    -            model.eval()
    -            heads_to_prune = {
    -                0: list(range(1, self.model_tester.num_attention_heads)),
    -                -1: [0],
    -            }
    -            model.prune_heads(heads_to_prune)
    -
    -            with tempfile.TemporaryDirectory() as temp_dir_name:
    -                model.save_pretrained(temp_dir_name)
    -                model = model_class.from_pretrained(temp_dir_name)
    -                model.to(torch_device)
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs[-1]
    -            self.assertEqual(attentions[0].shape[-3], 1)
    -            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
    -            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
    -            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
    -
    -    def test_head_pruning_save_load_from_config_init(self):
    -        if not self.test_pruning:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            (
    -                config,
    -                inputs_dict,
    -            ) = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            if "head_mask" in inputs_dict:
    -                del inputs_dict["head_mask"]
    -
    -            inputs_dict["output_attentions"] = True
    -            config.output_hidden_states = False
    -
    -            heads_to_prune = {
    -                0: list(range(1, self.model_tester.num_attention_heads)),
    -                -1: [0],
    -            }
    -            config.pruned_heads = heads_to_prune
    -
    -            model = model_class(config=config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs[-1]
    -
    -            self.assertEqual(attentions[0].shape[-3], 1)
    -            # TODO: To have this check, we will need at least 3 layers. Do we really need it?
    -            # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
    -            self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
    -
    -    def test_head_pruning_integration(self):
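-        """Prune heads via the config, save/reload, prune again, and check attention shapes and `config.pruned_heads` stay consistent."""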
    -        if not self.test_pruning:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            (
    -                config,
    -                inputs_dict,
    -            ) = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -            if "head_mask" in inputs_dict:
    -                del inputs_dict["head_mask"]
    -
    -            inputs_dict["output_attentions"] = True
    -            config.output_hidden_states = False
    -
    -            heads_to_prune = {1: [1, 2]}
    -            config.pruned_heads = heads_to_prune
    -
    -            model = model_class(config=config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs[-1]
    -
    -            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
    -            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
    -
    -            with tempfile.TemporaryDirectory() as temp_dir_name:
    -                model.save_pretrained(temp_dir_name)
    -                model = model_class.from_pretrained(temp_dir_name)
    -                model.to(torch_device)
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs[-1]
    -
    -            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
    -            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
    -
    -            heads_to_prune = {0: [0], 1: [1, 2]}
    -            model.prune_heads(heads_to_prune)
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -            attentions = outputs[-1]
    -
    -            self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
    -            self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
    -
    -            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]})
    -
    -    def test_hidden_states_output(self):
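-        """Check the number and shapes of hidden states returned when `output_hidden_states` is enabled via inputs or config."""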
    -        def check_hidden_states_output(inputs_dict, config, model_class):
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            with torch.no_grad():
    -                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
    -
    -            expected_num_layers = getattr(
    -                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
    -            )
    -            self.assertEqual(len(hidden_states), expected_num_layers)
    -
    -            if hasattr(self.model_tester, "encoder_seq_length"):
    -                seq_length = self.model_tester.encoder_seq_length
    -                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
    -                    seq_length = seq_length * self.model_tester.chunk_length
    -            else:
    -                seq_length = self.model_tester.seq_length
    -
    -            self.assertListEqual(
    -                list(hidden_states[0].shape[-2:]),
    -                [seq_length, self.model_tester.hidden_size],
    -            )
    -
    -            if config.is_encoder_decoder:
    -                hidden_states = outputs.decoder_hidden_states
    -
    -                self.assertIsInstance(hidden_states, (list, tuple))
    -                self.assertEqual(len(hidden_states), expected_num_layers)
    -                seq_len = getattr(self.model_tester, "seq_length", None)
    -                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
    -
    -                self.assertListEqual(
    -                    list(hidden_states[0].shape[-2:]),
    -                    [decoder_seq_length, self.model_tester.hidden_size],
    -                )
    -
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            inputs_dict["output_hidden_states"] = True
    -            check_hidden_states_output(inputs_dict, config, model_class)
    -
    -            # check that output_hidden_states also work using config
    -            del inputs_dict["output_hidden_states"]
    -            config.output_hidden_states = True
    -
    -            check_hidden_states_output(inputs_dict, config, model_class)
    -
    -    def test_retain_grad_hidden_states_attentions(self):
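-        """Check that hidden states and attention tensors can retain gradients through a backward pass."""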
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_hidden_states = True
    -        config.output_attentions = self.has_attentions
    -
    -        # no need to test all models as different heads yield the same functionality
    -        model_class = self.all_model_classes[0]
    -        model = model_class(config)
    -        model.to(torch_device)
    -
    -        inputs = self._prepare_for_class(inputs_dict, model_class)
    -
    -        outputs = model(**inputs)
    -
    -        output = outputs[0]
    -
    -        if config.is_encoder_decoder:
    -            # Seq2Seq models
    -            encoder_hidden_states = outputs.encoder_hidden_states[0]
    -            encoder_hidden_states.retain_grad()
    -
    -            decoder_hidden_states = outputs.decoder_hidden_states[0]
    -            decoder_hidden_states.retain_grad()
    -
    -            if self.has_attentions:
    -                encoder_attentions = outputs.encoder_attentions[0]
    -                encoder_attentions.retain_grad()
    -
    -                decoder_attentions = outputs.decoder_attentions[0]
    -                decoder_attentions.retain_grad()
    -
    -                cross_attentions = outputs.cross_attentions[0]
    -                cross_attentions.retain_grad()
    -
    -            output.flatten()[0].backward(retain_graph=True)
    -
    -            self.assertIsNotNone(encoder_hidden_states.grad)
    -            self.assertIsNotNone(decoder_hidden_states.grad)
    -
    -            if self.has_attentions:
    -                self.assertIsNotNone(encoder_attentions.grad)
    -                self.assertIsNotNone(decoder_attentions.grad)
    -                self.assertIsNotNone(cross_attentions.grad)
    -        else:
    -            # Encoder-/Decoder-only models
    -            hidden_states = outputs.hidden_states[0]
    -            hidden_states.retain_grad()
    -
    -            if self.has_attentions:
    -                attentions = outputs.attentions[0]
    -                attentions.retain_grad()
    -
    -            output.flatten()[0].backward(retain_graph=True)
    -
    -            self.assertIsNotNone(hidden_states.grad)
    -
    -            if self.has_attentions:
    -                self.assertIsNotNone(attentions.grad)
    -
    -    def test_feed_forward_chunking(self):
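-        """Check that enabling `chunk_size_feed_forward` does not change the model outputs (up to a small tolerance)."""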
    -        (
    -            original_config,
    -            inputs_dict,
    -        ) = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes:
    -            torch.manual_seed(0)
    -            config = copy.deepcopy(original_config)
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -
    -            torch.manual_seed(0)
    -            config.chunk_size_feed_forward = 1
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0]
    -            self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
    -
    -    def test_resize_position_vector_embeddings(self):
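-        """Grow and shrink the position embeddings and check the config, embedding shapes, forward passes, and preserved weights."""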
    -        if not self.test_resize_position_embeddings:
    -            return
    -
    -        (
    -            original_config,
    -            inputs_dict,
    -        ) = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            config = copy.deepcopy(original_config)
    -            model = model_class(config)
    -            model.to(torch_device)
    -
    -            if self.model_tester.is_training is False:
    -                model.eval()
    -
    -            max_position_embeddings = config.max_position_embeddings
    -
-            # Retrieve the embeddings and clone them
    -            if model.config.is_encoder_decoder:
    -                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
    -                encoder_cloned_embeddings = encoder_model_embed.weight.clone()
    -                decoder_cloned_embeddings = decoder_model_embed.weight.clone()
    -            else:
    -                model_embed = model.get_position_embeddings()
    -                cloned_embeddings = model_embed.weight.clone()
    -
    -            # Check that resizing the position embeddings with a larger max_position_embeddings increases
-            # the model's position embeddings size
    -            model.resize_position_embeddings(max_position_embeddings + 10)
    -            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)
    -
    -            # Check that it actually resizes the embeddings matrix
    -            if model.config.is_encoder_decoder:
    -                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
    -                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10)
    -                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10)
    -            else:
    -                model_embed = model.get_position_embeddings()
    -                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
    -
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            # Check that resizing the position embeddings with a smaller max_position_embeddings decreases
    -            # the model's max_position_embeddings
    -            model.resize_position_embeddings(max_position_embeddings - 5)
    -            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5)
    -
    -            # Check that it actually resizes the embeddings matrix
    -            if model.config.is_encoder_decoder:
    -                encoder_model_embed, decoder_model_embed = model.get_position_embeddings()
    -                self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5)
    -                self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5)
    -            else:
    -                model_embed = model.get_position_embeddings()
    -                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)
    -
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
-            # Check that growing and shrinking the position embeddings has not modified the first part of the embedding matrix.
    -            models_equal = True
    -
    -            if model.config.is_encoder_decoder:
    -                for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
    -                    if p1.data.ne(p2.data).sum() > 0:
    -                        models_equal = False
    -                for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
    -                    if p1.data.ne(p2.data).sum() > 0:
    -                        models_equal = False
    -            else:
    -                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
    -                    if p1.data.ne(p2.data).sum() > 0:
    -                        models_equal = False
    -
    -            self.assertTrue(models_equal)
    -
    -    @mark.skip("Skipped for Gaudi")
    -    def test_resize_tokens_embeddings(self):
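-        """Grow and shrink the token embeddings (including `pad_to_multiple_of`) and check the config, shapes, and forward passes."""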
    -        (
    -            original_config,
    -            inputs_dict,
    -        ) = self.model_tester.prepare_config_and_inputs_for_common()
    -        if not self.test_resize_embeddings:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            config = copy.deepcopy(original_config)
    -            model = model_class(config)
    -            model.to(torch_device)
    -
    -            if self.model_tester.is_training is False:
    -                model.eval()
    -
    -            model_vocab_size = config.vocab_size
-            # Retrieve the embeddings and clone them
    -            model_embed = model.resize_token_embeddings(model_vocab_size)
    -            cloned_embeddings = model_embed.weight.clone()
    -
    -            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
    -            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
    -            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
    -            # Check that it actually resizes the embeddings matrix
    -            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
    -            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
    -            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
    -            # Check that it actually resizes the embeddings matrix
    -            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
    -
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            # Input ids should be clamped to the maximum size of the vocabulary
    -            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
    -
-            # make sure that decoder_input_ids are clamped as well
    -            if "decoder_input_ids" in inputs_dict:
    -                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
    -            models_equal = True
    -            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
    -                if p1.data.ne(p2.data).sum() > 0:
    -                    models_equal = False
    -
    -            self.assertTrue(models_equal)
    -
    -            config = copy.deepcopy(original_config)
    -            model = model_class(config)
    -            model.to(torch_device)
    -
    -            model_vocab_size = config.vocab_size
    -            model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
-            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
    -
    -            model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
-            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
    -
    -            model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
-            self.assertEqual(model_embed.weight.shape[0] % 64, 0)
    -
    -            with self.assertRaisesRegex(
    -                ValueError,
    -                "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer",
    -            ):
    -                model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3)
    -
    -    def test_resize_embeddings_untied(self):
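-        """Resize token embeddings with untied input/output embeddings and check that the output embeddings (and bias) are resized too."""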
    -        (
    -            original_config,
    -            inputs_dict,
    -        ) = self.model_tester.prepare_config_and_inputs_for_common()
    -        if not self.test_resize_embeddings:
    -            return
    -
    -        original_config.tie_word_embeddings = False
    -
-        # if the model cannot untie its embeddings -> skip the test
    -        if original_config.tie_word_embeddings:
    -            return
    -
    -        for model_class in self.all_model_classes:
    -            config = copy.deepcopy(original_config)
    -            model = model_class(config).to(torch_device)
    -
-            # if there are no output embeddings -> skip this model class
    -            if model.get_output_embeddings() is None:
    -                continue
    -
    -            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
    -            model_vocab_size = config.vocab_size
    -            model.resize_token_embeddings(model_vocab_size + 10)
    -            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
    -            output_embeds = model.get_output_embeddings()
    -            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
    -            # Check bias if present
    -            if output_embeds.bias is not None:
    -                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
    -            model.resize_token_embeddings(model_vocab_size - 15)
    -            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
    -            # Check that it actually resizes the embeddings matrix
    -            output_embeds = model.get_output_embeddings()
    -            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
    -            # Check bias if present
    -            if output_embeds.bias is not None:
    -                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            # Input ids should be clamped to the maximum size of the vocabulary
    -            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
    -            if "decoder_input_ids" in inputs_dict:
    -                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
    -            # Check that the model can still do a forward pass successfully (every parameter should be resized)
    -            model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -    def test_model_common_attributes(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding))
    -            model.set_input_embeddings(nn.Embedding(10, 10))
    -            x = model.get_output_embeddings()
    -            self.assertTrue(x is None or isinstance(x, nn.Linear))
    -
    -    def test_model_main_input_name(self):
    -        for model_class in self.all_model_classes:
    -            model_signature = inspect.signature(getattr(model_class, "forward"))
    -            # The main input is the name of the argument after `self`
    -            observed_main_input_name = list(model_signature.parameters.keys())[1]
    -            self.assertEqual(model_class.main_input_name, observed_main_input_name)
    -
    -    def test_correct_missing_keys(self):
    -        if not self.test_missing_keys:
    -            return
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            base_model_prefix = model.base_model_prefix
    -
    -            if hasattr(model, base_model_prefix):
    -                extra_params = {k: v for k, v in model.named_parameters() if not k.startswith(base_model_prefix)}
    -                extra_params.update({k: v for k, v in model.named_buffers() if not k.startswith(base_model_prefix)})
    -                # Some models define this as None
    -                if model._keys_to_ignore_on_load_missing:
    -                    for key in model._keys_to_ignore_on_load_missing:
    -                        extra_params.pop(key, None)
    -
    -                if not extra_params:
-                    # In that case, we *are* on a head model, but none of the extra
-                    # keys are actual parameters; this is covered by the
-                    # `test_tied_model_weights_key_ignore` test.
    -                    continue
    -
    -                with tempfile.TemporaryDirectory() as temp_dir_name:
    -                    model.base_model.save_pretrained(temp_dir_name)
    -                    model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True)
    -                    self.assertGreater(len(loading_info["missing_keys"]), 0, model.__class__.__name__)
    -
    -    def test_tie_model_weights(self):
    -        if not self.test_torchscript:
    -            return
    -
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        def check_same_values(layer_1, layer_2):
    -            equal = True
    -            for p1, p2 in zip(layer_1.weight, layer_2.weight):
    -                if p1.data.ne(p2.data).sum() > 0:
    -                    equal = False
    -            return equal
    -
    -        for model_class in self.all_model_classes:
    -            config.torchscript = True
    -            model_not_tied = model_class(config)
    -            if model_not_tied.get_output_embeddings() is None:
    -                continue
    -
    -            config_tied = copy.deepcopy(config)
    -            config_tied.torchscript = False
    -            model_tied = model_class(config_tied)
    -            params_tied = list(model_tied.parameters())
    -            # Check that the embedding layer and decoding layer are the same in size and in value
    -            # self.assertTrue(check_same_values(embeddings, decoding))
    -
    -            # # Check that after modification, they remain the same.
    -            # embeddings.weight.data.div_(2)
    -            # # Check that the embedding layer and decoding layer are the same in size and in value
    -            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
    -            # self.assertTrue(check_same_values(embeddings, decoding))
    -
    -            # # Check that after modification, they remain the same.
    -            # decoding.weight.data.div_(4)
    -            # # Check that the embedding layer and decoding layer are the same in size and in value
    -            # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
    -            # self.assertTrue(check_same_values(embeddings, decoding))
    -
    -            # Check that after resize they remain tied.
    -            model_tied.resize_token_embeddings(config.vocab_size + 10)
    -            params_tied_2 = list(model_tied.parameters())
    -            self.assertEqual(len(params_tied_2), len(params_tied))
    -
    -            # decoding.weight.data.mul_(20)
    -            # # Check that the embedding layer and decoding layer are the same in size and in value
    -            # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
    -            # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
    -
    -    @require_safetensors
    -    def test_can_use_safetensors(self):
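-        """Save with `safe_serialization=True`, reload, and check state dicts, missing keys, and shared-tensor pointers."""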
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes:
    -            model_tied = model_class(config)
    -            with tempfile.TemporaryDirectory() as d:
    -                try:
    -                    model_tied.save_pretrained(d, safe_serialization=True)
    -                except Exception as e:
    -                    raise Exception(f"Class {model_class.__name__} cannot be saved using safetensors: {e}")
    -
    -                model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True)
    -                # Checking the state dicts are correct
    -                reloaded_state = model_reloaded.state_dict()
    -                for k, v in model_tied.state_dict().items():
    -                    self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded")
    -                    torch.testing.assert_close(
    -                        v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}"
    -                    )
-                # Check that there was no complaint about missing weights
    -                self.assertEqual(infos["missing_keys"], [])
    -
-                # Check that the tensor sharing is correct
    -                ptrs = defaultdict(list)
    -                for k, v in model_tied.state_dict().items():
    -                    ptrs[v.data_ptr()].append(k)
    -
    -                shared_ptrs = {k: v for k, v in ptrs.items() if len(v) > 1}
    -
    -                for _, shared_names in shared_ptrs.items():
    -                    reloaded_ptrs = {reloaded_state[k].data_ptr() for k in shared_names}
    -                    self.assertEqual(
    -                        len(reloaded_ptrs),
    -                        1,
    -                        f"The shared pointers are incorrect, found different pointers for keys {shared_names}",
    -                    )
    -
    -    def test_load_save_without_tied_weights(self):
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.tie_word_embeddings = False
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            with tempfile.TemporaryDirectory() as d:
    -                model.save_pretrained(d)
    -
    -                model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True)
    -                # Checking the state dicts are correct
    -                reloaded_state = model_reloaded.state_dict()
    -                for k, v in model.state_dict().items():
    -                    self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded")
    -                    torch.testing.assert_close(
    -                        v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}"
    -                    )
-                # Check that there was no complaint about missing weights
    -                self.assertEqual(infos["missing_keys"], [])
    -
    -    def test_tied_weights_keys(self):
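-        """Check that `_tied_weights_keys` covers every group of tensors sharing the same storage when embeddings are tied."""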
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.tie_word_embeddings = True
    -        for model_class in self.all_model_classes:
    -            model_tied = model_class(config)
    -
    -            ptrs = collections.defaultdict(list)
    -            for name, tensor in model_tied.state_dict().items():
    -                ptrs[id_tensor_storage(tensor)].append(name)
    -
    -            # These are all the pointers of shared tensors.
    -            tied_params = [names for _, names in ptrs.items() if len(names) > 1]
    -
    -            tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else []
-            # Check that we get a hit for each key
    -            for key in tied_weight_keys:
    -                if not any(re.search(key, p) for group in tied_params for p in group):
    -                    raise ValueError(f"{key} is not a tied weight key for {model_class}.")
    -
-            # Remove the tied weights found via `tied_weight_keys` from `tied_params` -> at most one name should be left in each group afterwards
    -            for key in tied_weight_keys:
    -                for i in range(len(tied_params)):
    -                    tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None]
    -
    -            tied_params = [group for group in tied_params if len(group) > 1]
    -            self.assertListEqual(
    -                tied_params,
    -                [],
    -                f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.",
    -            )
    -
    -    def test_model_weights_reload_no_missing_tied_weights(self):
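-        """Overwrite the saved checkpoint with a placeholder and check that exactly the expected keys are reported as missing."""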
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            with tempfile.TemporaryDirectory() as tmp_dir:
    -                model.save_pretrained(tmp_dir)
    -
-                # We are nuking ALL weights in the file, so every parameter should
-                # be reported as missing on load. We check that neither too many nor too few keys are reported.
    -                placeholder_dict = {"tensor": torch.tensor([1, 2])}
    -                safe_save_file(placeholder_dict, os.path.join(tmp_dir, "model.safetensors"), metadata={"format": "pt"})
    -                model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True)
    -
    -                prefix = f"{model_reloaded.base_model_prefix}."
    -                params = dict(model_reloaded.named_parameters())
    -                params.update(dict(model_reloaded.named_buffers()))
    -                param_names = {k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()}
    -
    -                missing_keys = set(infos["missing_keys"])
    -
    -                extra_missing = missing_keys - param_names
    -                # Remove tied weights from extra missing: they are normally not warned as missing if their tied
    -                # counterpart is present but here there are no weights at all so we do get the warning.
    -                ptrs = collections.defaultdict(list)
    -                for name, tensor in model_reloaded.state_dict().items():
    -                    ptrs[id_tensor_storage(tensor)].append(name)
    -                tied_params = [names for _, names in ptrs.items() if len(names) > 1]
    -                for group in tied_params:
    -                    group = {k[len(prefix) :] if k.startswith(prefix) else k for k in group}
    -                    # We remove the group from extra_missing if not all weights from group are in it
    -                    if len(group - extra_missing) > 0:
    -                        extra_missing = extra_missing - set(group)
    -
    -                self.assertEqual(
    -                    extra_missing,
    -                    set(),
    -                    f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}. "
    -                    f"For debugging, tied parameters are {tied_params}",
    -                )
    -
    -                missed_missing = param_names - missing_keys
    -                # Remove nonpersistent buffers from missed_missing
    -                buffers = [n for n, _ in model_reloaded.named_buffers()]
    -                nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()}
    -                nonpersistent_buffers = {
    -                    k[len(prefix) :] if k.startswith(prefix) else k for k in nonpersistent_buffers
    -                }
    -                missed_missing = missed_missing - nonpersistent_buffers
    -
    -                if model_reloaded._keys_to_ignore_on_load_missing is None:
    -                    expected_missing = set()
    -                else:
    -                    expected_missing = set(model_reloaded._keys_to_ignore_on_load_missing)
    -                self.assertEqual(
    -                    missed_missing,
    -                    expected_missing,
    -                    f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real"
    -                    " parameters. If they are non persistent buffers make sure to instantiate them with"
    -                    " `persistent=False`",
    -                )
    -
    -    @mark.skip("skip - test is slow")
    -    def test_model_outputs_equivalence(self):
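-        """Check that tuple outputs (`return_dict=False`) match dict outputs (`return_dict=True`) for various output flags."""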
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        def set_nan_tensor_to_zero(t):
    -            t[t != t] = 0
    -            return t
    -
    -        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
    -            with torch.no_grad():
    -                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
    -                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
    -
    -                def recursive_check(tuple_object, dict_object):
    -                    if isinstance(tuple_object, (List, Tuple)):
    -                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
    -                            recursive_check(tuple_iterable_value, dict_iterable_value)
    -                    elif isinstance(tuple_object, Dict):
    -                        for tuple_iterable_value, dict_iterable_value in zip(
    -                            tuple_object.values(), dict_object.values()
    -                        ):
    -                            recursive_check(tuple_iterable_value, dict_iterable_value)
    -                    elif tuple_object is None:
    -                        return
    -                    else:
    -                        self.assertTrue(
    -                            torch.allclose(
    -                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
    -                            ),
    -                            msg=(
    -                                "Tuple and dict output are not equal. Difference:"
    -                                f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
    -                                f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
    -                                f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
    -                            ),
    -                        )
    -
    -                recursive_check(tuple_output, dict_output)
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
    -            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
    -            check_equivalence(model, tuple_inputs, dict_inputs)
    -
    -            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            check_equivalence(model, tuple_inputs, dict_inputs)
    -
    -            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
    -            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
    -            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
    -
    -            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -            check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
    -
    -            if self.has_attentions:
    -                tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
    -                dict_inputs = self._prepare_for_class(inputs_dict, model_class)
    -                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
    -
    -                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -                check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
    -
    -                tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -                dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -                check_equivalence(
    -                    model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
    -                )
    -
    -    # Don't copy this method to model specific test file!
    -    # TODO: remove this method once the issues are all fixed!
    -    def _make_attention_mask_non_null(self, inputs_dict):
    -        """Make sure no sequence has all zeros as attention mask"""
    -
    -        for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]:
    -            if k in inputs_dict:
    -                attention_mask = inputs_dict[k]
    -
-                # Make sure there are no all-zero attention masks, to avoid failures for now.
    -                # Put `1` at the beginning of sequences to make it still work when combining causal attention masks.
    -                # TODO: remove this line once a fix regarding large negative values for attention mask is done.
    -                attention_mask = torch.cat(
    -                    [torch.ones_like(attention_mask[:, :1], dtype=attention_mask.dtype), attention_mask[:, 1:]], dim=-1
    -                )
    -
-                # The commented-out block below would make the first sequence's attention mask all zeros.
-                # Currently, this fails for `TFWav2Vec2Model` because of the different large negative fill values,
-                # like `-1e4`, `-1e9`, `-1e30` and `-inf`, used for the attention mask across models/frameworks.
    -                # TODO: enable this block once the large negative values thing is cleaned up.
    -                # (see https://github.com/huggingface/transformers/issues/14859)
    -                # attention_mask = torch.cat(
    -                #     [torch.zeros_like(attention_mask[:1], dtype=attention_mask.dtype), attention_mask[1:]],
    -                #     dim=0
    -                # )
    -
    -                inputs_dict[k] = attention_mask
    -
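As a small illustration of the workaround above, a sketch with toy tensors (not the actual test inputs):

import torch

# A toy batch where the first sequence would otherwise mask out every token.
attention_mask = torch.tensor([[0, 0, 0, 0], [1, 1, 0, 0]])

# Force the first position of every sequence to 1, as the helper above does.
attention_mask = torch.cat(
    [torch.ones_like(attention_mask[:, :1]), attention_mask[:, 1:]], dim=-1
)
print(attention_mask)  # tensor([[1, 0, 0, 0], [1, 1, 0, 0]])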
    -    # Don't copy this method to model specific test file!
    -    # TODO: remove this method once the issues are all fixed!
    -    def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_class):
    -        """For temporarily ignoring some failed test cases (issues to be fixed)"""
    -
    -        tf_keys = {k for k, v in tf_outputs.items() if v is not None}
    -        pt_keys = {k for k, v in pt_outputs.items() if v is not None}
    -
    -        key_differences = tf_keys.symmetric_difference(pt_keys)
    -
    -        if model_class.__name__ in [
    -            "FlaubertWithLMHeadModel",
    -            "FunnelForPreTraining",
    -            "ElectraForPreTraining",
    -            "XLMWithLMHeadModel",
    -            "TransfoXLLMHeadModel",
    -        ]:
    -            for k in key_differences:
    -                if k in ["loss", "losses"]:
    -                    tf_keys.discard(k)
    -                    pt_keys.discard(k)
    -        elif model_class.__name__.startswith("GPT2"):
    -            # `TFGPT2` has `past_key_values` as a tensor while `GPT2` has it as a tuple.
    -            tf_keys.discard("past_key_values")
    -            pt_keys.discard("past_key_values")
    -
    -        # create new outputs from the remaining fields
    -        new_tf_outputs = type(tf_outputs)(**{k: tf_outputs[k] for k in tf_keys})
    -        new_pt_outputs = type(pt_outputs)(**{k: pt_outputs[k] for k in pt_keys})
    -
    -        return new_tf_outputs, new_pt_outputs
    -
    -    def test_inputs_embeds(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            model.to(torch_device)
    -            model.eval()
    -
    -            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
    -
    -            if not self.is_encoder_decoder:
    -                input_ids = inputs["input_ids"]
    -                del inputs["input_ids"]
    -            else:
    -                encoder_input_ids = inputs["input_ids"]
    -                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
    -                del inputs["input_ids"]
    -                inputs.pop("decoder_input_ids", None)
    -
    -            wte = model.get_input_embeddings()
    -            if not self.is_encoder_decoder:
    -                inputs["inputs_embeds"] = wte(input_ids)
    -            else:
    -                inputs["inputs_embeds"] = wte(encoder_input_ids)
    -                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
    -
    -            with torch.no_grad():
    -                model(**inputs)[0]
    -
    -    @require_torch_multi_gpu
    -    def test_multi_gpu_data_parallel_forward(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        # some params shouldn't be scattered by nn.DataParallel
    -        # so just remove them if they are present.
    -        blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
    -        for k in blacklist_non_batched_params:
    -            inputs_dict.pop(k, None)
    -
-        # move input tensors to cuda:0
    -        for k, v in inputs_dict.items():
    -            if torch.is_tensor(v):
    -                inputs_dict[k] = v.to(0)
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config=config)
    -            model.to(0)
    -            model.eval()
    -
    -            # Wrap model in nn.DataParallel
    -            model = nn.DataParallel(model)
    -            with torch.no_grad():
    -                _ = model(**self._prepare_for_class(inputs_dict, model_class))
    -
    -    @require_torch_multi_gpu
    -    def test_model_parallelization(self):
    -        if not self.test_model_parallel:
    -            return
    -
    -        # a candidate for testing_utils
    -        def get_current_gpu_memory_use():
    -            """returns a list of cuda memory allocations per GPU in MBs"""
    -
    -            per_device_memory = []
    -            for id in range(torch.cuda.device_count()):
    -                with torch.cuda.device(id):
    -                    per_device_memory.append(torch.cuda.memory_allocated() >> 20)
    -
    -            return per_device_memory
    -
    -        # Needs a large model to see the difference.
    -        config = self.model_tester.get_large_model_config()
    -
    -        for model_class in self.all_parallelizable_model_classes:
    -            torch.cuda.empty_cache()
    -
    -            # 1. single gpu memory load + unload + memory measurements
    -            # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests)
    -            memory_at_start = get_current_gpu_memory_use()
    -
    -            # Put model on device 0 and take a memory snapshot
    -            model = model_class(config)
    -            model.to("cuda:0")
    -            memory_after_model_load = get_current_gpu_memory_use()
    -
    -            # The memory use on device 0 should be higher than it was initially.
    -            self.assertGreater(memory_after_model_load[0], memory_at_start[0])
    -
    -            del model
    -            gc.collect()
    -            torch.cuda.empty_cache()
    -
    -            # 2. MP test
    -            # it's essential to re-calibrate the usage before the next stage
    -            memory_at_start = get_current_gpu_memory_use()
    -
    -            # Spread model layers over multiple devices
    -            model = model_class(config)
    -            model.parallelize()
    -            memory_after_parallelization = get_current_gpu_memory_use()
    -
    -            # Assert that the memory use on all devices is higher than it was when loaded only on CPU
    -            for n in range(len(model.device_map.keys())):
    -                self.assertGreater(memory_after_parallelization[n], memory_at_start[n])
    -
    -            # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it
    -            self.assertLess(memory_after_parallelization[0], memory_after_model_load[0])
    -
    -            # Assert that the memory use of device 1 is higher than it was when the entire model was loaded
    -            # on device 0 and device 1 wasn't used at all
    -            self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1])
    -
    -            del model
    -            gc.collect()
    -            torch.cuda.empty_cache()
    -
    -    @require_torch_multi_gpu
    -    def test_model_parallel_equal_results(self):
    -        if not self.test_model_parallel:
    -            return
    -
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_parallelizable_model_classes:
    -            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
    -
    -            def cast_to_device(dictionary, device):
    -                output = {}
    -                for k, v in dictionary.items():
    -                    if isinstance(v, torch.Tensor):
    -                        output[k] = v.to(device)
    -                    else:
    -                        output[k] = v
    -
    -                return output
    -
    -            model = model_class(config)
    -            output = model(**cast_to_device(inputs_dict, "cpu"))
    -
    -            model.parallelize()
    -
    -            parallel_output = model(**cast_to_device(inputs_dict, "cuda:0"))
    -
    -            for value, parallel_value in zip(output, parallel_output):
    -                if isinstance(value, torch.Tensor):
    -                    self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7))
    -                elif isinstance(value, (Tuple, List)):
    -                    for value_, parallel_value_ in zip(value, parallel_value):
    -                        self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7))
    -
    -    @require_torch_multi_gpu
    -    def test_model_parallel_beam_search(self):
    -        if not self.test_model_parallel:
    -            return
    -
    -        all_generative_and_parallelizable_model_classes = tuple(
    -            set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes)
    -        )
    -
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in all_generative_and_parallelizable_model_classes:
    -            inputs_dict = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config)
    -
    -            def cast_to_device(dictionary, device):
    -                output = {}
    -                for k, v in dictionary.items():
    -                    if isinstance(v, torch.Tensor):
    -                        output[k] = v.to(device)
    -                    else:
    -                        output[k] = v
    -
    -                return output
    -
    -            model.parallelize()
    -            model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2)
    -
    -    def check_device_map_is_respected(self, model, device_map):
-        for param_name, param in model.named_parameters():
-            # Find the device for this parameter in device_map by walking up its module path
-            full_name = param_name
-            while len(param_name) > 0 and param_name not in device_map:
-                param_name = ".".join(param_name.split(".")[:-1])
-            if param_name not in device_map:
-                raise ValueError(f"device map is incomplete, it does not contain any device for `{full_name}`.")
    -
    -            param_device = device_map[param_name]
    -            if param_device in ["cpu", "disk"]:
    -                self.assertEqual(param.device, torch.device("meta"))
    -            else:
    -                self.assertEqual(param.device, torch.device(param_device))
    -
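A self-contained sketch of the prefix matching used above to resolve a parameter to its device (the device map below is hypothetical):

def resolve_device(param_name, device_map):
    # Walk up the module hierarchy until some prefix of the name appears in the map.
    while len(param_name) > 0 and param_name not in device_map:
        param_name = ".".join(param_name.split(".")[:-1])
    if param_name not in device_map:
        raise ValueError(f"device map is incomplete, no device found for `{param_name}`.")
    return device_map[param_name]

# Hypothetical map: module prefixes -> devices ("cpu"/"disk" mean offloaded weights).
device_map = {"encoder.layer.0": 0, "encoder.layer.1": "cpu", "lm_head": "disk"}
assert resolve_device("encoder.layer.1.attention.query.weight", device_map) == "cpu"
assert resolve_device("lm_head.weight", device_map) == "disk"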
    -    @require_accelerate
    -    @mark.accelerate_tests
    -    @require_torch_gpu
    -    def test_disk_offload(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if model_class._no_split_modules is None:
    -                continue
    -
    -            inputs_dict_class = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config).eval()
    -            model = model.to(torch_device)
    -            torch.manual_seed(0)
    -            base_output = model(**inputs_dict_class)
    -
    -            model_size = compute_module_sizes(model)[""]
    -            with tempfile.TemporaryDirectory() as tmp_dir:
    -                model.cpu().save_pretrained(tmp_dir)
    -
    -                with self.assertRaises(ValueError):
    -                    max_size = int(self.model_split_percents[0] * model_size)
    -                    max_memory = {0: max_size, "cpu": max_size}
-                    # This errors out because it's missing an offload folder
    -                    new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
    -
    -                max_size = int(self.model_split_percents[1] * model_size)
    -                max_memory = {0: max_size, "cpu": max_size}
    -                new_model = model_class.from_pretrained(
    -                    tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
    -                )
    -
    -                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
    -                torch.manual_seed(0)
    -                new_output = new_model(**inputs_dict_class)
    -
    -                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
    -
    -    @require_accelerate
    -    @mark.accelerate_tests
    -    @require_torch_gpu
    -    def test_cpu_offload(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if model_class._no_split_modules is None:
    -                continue
    -
    -            inputs_dict_class = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config).eval()
    -            model = model.to(torch_device)
    -
    -            torch.manual_seed(0)
    -            base_output = model(**inputs_dict_class)
    -
    -            model_size = compute_module_sizes(model)[""]
    -            # We test several splits of sizes to make sure it works.
    -            max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
    -            with tempfile.TemporaryDirectory() as tmp_dir:
    -                model.cpu().save_pretrained(tmp_dir)
    -
    -                for max_size in max_gpu_sizes:
    -                    max_memory = {0: max_size, "cpu": model_size * 2}
    -                    new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
    -                    # Making sure part of the model will actually end up offloaded
    -                    self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
    -
    -                    self.check_device_map_is_respected(new_model, new_model.hf_device_map)
    -
    -                    torch.manual_seed(0)
    -                    new_output = new_model(**inputs_dict_class)
    -
    -                    self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
    -
    -    @require_accelerate
    -    @mark.accelerate_tests
    -    @require_torch_multi_gpu
    -    def test_model_parallelism(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if model_class._no_split_modules is None:
    -                continue
    -
    -            inputs_dict_class = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config).eval()
    -            model = model.to(torch_device)
    -
    -            torch.manual_seed(0)
    -            base_output = model(**inputs_dict_class)
    -
    -            model_size = compute_module_sizes(model)[""]
    -            # We test several splits of sizes to make sure it works.
    -            max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
    -            with tempfile.TemporaryDirectory() as tmp_dir:
    -                model.cpu().save_pretrained(tmp_dir)
    -
    -                for max_size in max_gpu_sizes:
    -                    max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
    -                    new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
-                    # Making sure the model is actually split across the two GPUs
    -                    self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
    -
    -                    self.check_device_map_is_respected(new_model, new_model.hf_device_map)
    -
    -                    torch.manual_seed(0)
    -                    new_output = new_model(**inputs_dict_class)
    -
    -                    self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
    -
    -    def test_problem_types(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        problem_types = [
    -            {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float},
    -            {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long},
    -            {"title": "regression", "num_labels": 1, "dtype": torch.float},
    -        ]
    -
    -        for model_class in self.all_model_classes:
    -            if model_class.__name__ not in [
    -                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES),
    -                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES),
    -            ]:
    -                continue
    -
    -            for problem_type in problem_types:
    -                with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"):
    -                    config.problem_type = problem_type["title"]
    -                    config.num_labels = problem_type["num_labels"]
    -
    -                    model = model_class(config)
    -                    model.to(torch_device)
    -                    model.train()
    -
    -                    inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
    -
    -                    if problem_type["num_labels"] > 1:
    -                        inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"])
    -
    -                    inputs["labels"] = inputs["labels"].to(problem_type["dtype"])
    -
-                    # This tests that we do not trigger the warning from PyTorch "Using a target size that is different
-                    # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure
-                    # they have the same size.", which is a symptom that something is wrong with the regression problem.
    -                    # See https://github.com/huggingface/transformers/issues/11780
    -                    with warnings.catch_warnings(record=True) as warning_list:
    -                        loss = model(**inputs).loss
    -                    for w in warning_list:
    -                        if "Using a target size that is different to the input size" in str(w.message):
    -                            raise ValueError(
    -                                f"Something is going wrong in the regression problem: intercepted {w.message}"
    -                            )
    -
    -                    loss.backward()
    -
    -    def test_load_with_mismatched_shapes(self):
    -        if not self.test_mismatched_shapes:
    -            return
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            if model_class.__name__ not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES):
    -                continue
    -
    -            with self.subTest(msg=f"Testing {model_class}"):
    -                with tempfile.TemporaryDirectory() as tmp_dir:
    -                    model = model_class(config)
    -                    model.save_pretrained(tmp_dir)
    -
    -                    # Fails when we don't set ignore_mismatched_sizes=True
    -                    with self.assertRaises(RuntimeError):
    -                        new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42)
    -                    with self.assertRaises(RuntimeError):
    -                        new_model_without_prefix = AutoModel.from_pretrained(tmp_dir, vocab_size=10)
    -
    -                    logger = logging.get_logger("transformers.modeling_utils")
    -
    -                    with CaptureLogger(logger) as cl:
    -                        new_model = AutoModelForSequenceClassification.from_pretrained(
    -                            tmp_dir, num_labels=42, ignore_mismatched_sizes=True
    -                        )
    -                    self.assertIn("the shapes did not match", cl.out)
    -                    new_model.to(torch_device)
    -                    inputs = self._prepare_for_class(inputs_dict, model_class)
    -                    logits = new_model(**inputs).logits
    -                    self.assertEqual(logits.shape[1], 42)
    -
    -                    with CaptureLogger(logger) as cl:
    -                        new_model_without_prefix = AutoModel.from_pretrained(
    -                            tmp_dir, vocab_size=10, ignore_mismatched_sizes=True
    -                        )
    -                    self.assertIn("the shapes did not match", cl.out)
    -                    input_ids = ids_tensor((2, 8), 10)
    -                    new_model_without_prefix.to(torch_device)
    -                    if self.is_encoder_decoder:
    -                        new_model_without_prefix(input_ids, decoder_input_ids=input_ids)
    -                    else:
    -                        new_model_without_prefix(input_ids)
    -
    -    def test_model_is_small(self):
-        # Just a consistency check to make sure we are not running the common tests on large models (the cap below is 1M parameters).
    -        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes:
    -            model = model_class(config)
    -            num_params = model.num_parameters()
    -            assert (
    -                num_params < 1000000
    -            ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."
    -
    -
    -global_rng = random.Random()
    -
    -
    -def ids_tensor(shape, vocab_size, rng=None, name=None):
-    # Creates a random torch.long tensor of the given shape with values in [0, vocab_size)
    -    if rng is None:
    -        rng = global_rng
    -
    -    total_dims = 1
    -    for dim in shape:
    -        total_dims *= dim
    -
    -    values = []
    -    for _ in range(total_dims):
    -        values.append(rng.randint(0, vocab_size - 1))
    -
    -    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
    -
    -
    -def random_attention_mask(shape, rng=None, name=None):
-    attn_mask = ids_tensor(shape, vocab_size=2, rng=rng, name=name)
    -    # make sure that at least one token is attended to for each batch
    -    attn_mask[:, -1] = 1
    -    return attn_mask
    -
    -
    -def floats_tensor(shape, scale=1.0, rng=None, name=None):
    -    """Creates a random float32 tensor"""
    -    if rng is None:
    -        rng = global_rng
    -
    -    total_dims = 1
    -    for dim in shape:
    -        total_dims *= dim
    -
    -    values = []
    -    for _ in range(total_dims):
    -        values.append(rng.random() * scale)
    -
    -    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
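For reference, a usage sketch of the helpers above (assuming they are imported and that torch_device is defined, e.g. torch_device = "cpu"):

input_ids = ids_tensor((2, 8), vocab_size=10)    # torch.long tensor of shape (2, 8), values in [0, 10)
mask = random_attention_mask((2, 8))             # 0/1 mask whose last column is forced to 1
pixel_values = floats_tensor((2, 3, 4, 4))       # float32 tensor with values in [0, scale)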
    diff --git a/tests/transformers/tests/utils/__init__.py b/tests/transformers/tests/utils/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/tests/utils/test_activations.py b/tests/transformers/tests/utils/test_activations.py
    deleted file mode 100644
    index 5e58205d09..0000000000
    --- a/tests/transformers/tests/utils/test_activations.py
    +++ /dev/null
    @@ -1,73 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -from transformers import is_torch_available
    -from transformers.testing_utils import require_torch
    -
    -
    -if is_torch_available():
    -    import torch
    -    from transformers.activations import gelu_new, gelu_python, get_activation
    -
    -
    -@require_torch
    -class TestActivations(unittest.TestCase):
    -    def test_gelu_versions(self):
    -        x = torch.tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
    -        torch_builtin = get_activation("gelu")
    -        self.assertTrue(torch.allclose(gelu_python(x), torch_builtin(x)))
    -        self.assertFalse(torch.allclose(gelu_python(x), gelu_new(x)))
    -
    -    def test_gelu_10(self):
    -        x = torch.tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
    -        torch_builtin = get_activation("gelu")
    -        gelu10 = get_activation("gelu_10")
    -
    -        y_gelu = torch_builtin(x)
    -        y_gelu_10 = gelu10(x)
    -
    -        clipped_mask = torch.where(y_gelu_10 < 10.0, 1, 0)
    -
    -        self.assertTrue(torch.max(y_gelu_10).item() == 10.0)
    -        self.assertTrue(torch.allclose(y_gelu * clipped_mask, y_gelu_10 * clipped_mask))
    -
    -    def test_get_activation(self):
    -        get_activation("gelu")
    -        get_activation("gelu_10")
    -        get_activation("gelu_fast")
    -        get_activation("gelu_new")
    -        get_activation("gelu_python")
    -        get_activation("gelu_pytorch_tanh")
    -        get_activation("linear")
    -        get_activation("mish")
    -        get_activation("quick_gelu")
    -        get_activation("relu")
    -        get_activation("sigmoid")
    -        get_activation("silu")
    -        get_activation("swish")
    -        get_activation("tanh")
    -        with self.assertRaises(KeyError):
    -            get_activation("bogus")
    -        with self.assertRaises(KeyError):
    -            get_activation(None)
    -
    -    def test_activations_are_distinct_objects(self):
    -        act1 = get_activation("gelu")
    -        act1.a = 1
    -        act2 = get_activation("gelu")
    -        self.assertEqual(act1.a, 1)
    -        with self.assertRaises(AttributeError):
    -            _ = act2.a
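A brief numerical sketch (plain PyTorch, independent of transformers.activations) of why gelu_python and gelu_new are close but not identical, as test_gelu_versions above asserts:

import math
import torch

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh_approx(x):
    # Tanh approximation (the formula behind the "new"/"pytorch_tanh" variants).
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))

x = torch.tensor([-1.0, -0.1, 0.0, 0.1, 1.0])
print(gelu_exact(x))
print(gelu_tanh_approx(x))  # close, but not equal to machine precision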
    diff --git a/tests/transformers/tests/utils/test_activations_tf.py b/tests/transformers/tests/utils/test_activations_tf.py
    deleted file mode 100644
    index a8533ae362..0000000000
    --- a/tests/transformers/tests/utils/test_activations_tf.py
    +++ /dev/null
    @@ -1,58 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -import numpy as np
    -from transformers import is_tf_available
    -from transformers.testing_utils import require_tf
    -
    -
    -if is_tf_available():
    -    import tensorflow as tf
    -    from transformers.activations_tf import get_tf_activation
    -
    -
    -@require_tf
    -class TestTFActivations(unittest.TestCase):
    -    def test_gelu_10(self):
    -        x = tf.constant([-100, -1.0, -0.1, 0, 0.1, 1.0, 100.0])
    -        gelu = get_tf_activation("gelu")
    -        gelu10 = get_tf_activation("gelu_10")
    -
    -        y_gelu = gelu(x)
    -        y_gelu_10 = gelu10(x)
    -
    -        clipped_mask = tf.where(y_gelu_10 < 10.0, 1.0, 0.0)
    -
    -        self.assertEqual(tf.math.reduce_max(y_gelu_10).numpy().item(), 10.0)
    -        self.assertTrue(np.allclose(y_gelu * clipped_mask, y_gelu_10 * clipped_mask))
    -
    -    def test_get_activation(self):
    -        get_tf_activation("gelu")
    -        get_tf_activation("gelu_10")
    -        get_tf_activation("gelu_fast")
    -        get_tf_activation("gelu_new")
    -        get_tf_activation("glu")
    -        get_tf_activation("mish")
    -        get_tf_activation("quick_gelu")
    -        get_tf_activation("relu")
    -        get_tf_activation("sigmoid")
    -        get_tf_activation("silu")
    -        get_tf_activation("swish")
    -        get_tf_activation("tanh")
    -        with self.assertRaises(KeyError):
    -            get_tf_activation("bogus")
    -        with self.assertRaises(KeyError):
    -            get_tf_activation(None)
    diff --git a/tests/transformers/tests/utils/test_add_new_model_like.py b/tests/transformers/tests/utils/test_add_new_model_like.py
    deleted file mode 100644
    index 61ccc184f5..0000000000
    --- a/tests/transformers/tests/utils/test_add_new_model_like.py
    +++ /dev/null
    @@ -1,1548 +0,0 @@
    -# Copyright 2022 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -import os
    -import re
    -import tempfile
    -import unittest
    -from pathlib import Path
    -
    -import transformers
    -from transformers.commands.add_new_model_like import (
    -    ModelPatterns,
    -    _re_class_func,
    -    add_content_to_file,
    -    add_content_to_text,
    -    clean_frameworks_in_init,
    -    duplicate_doc_file,
    -    duplicate_module,
    -    filter_framework_files,
    -    find_base_model_checkpoint,
    -    get_model_files,
    -    get_module_from_file,
    -    parse_module_content,
    -    replace_model_patterns,
    -    retrieve_info_for_model,
    -    retrieve_model_classes,
    -    simplify_replacements,
    -)
    -from transformers.testing_utils import require_flax, require_tf, require_torch
    -
    -
    -BERT_MODEL_FILES = {
    -    "src/transformers/models/bert/__init__.py",
    -    "src/transformers/models/bert/configuration_bert.py",
    -    "src/transformers/models/bert/tokenization_bert.py",
    -    "src/transformers/models/bert/tokenization_bert_fast.py",
    -    "src/transformers/models/bert/tokenization_bert_tf.py",
    -    "src/transformers/models/bert/modeling_bert.py",
    -    "src/transformers/models/bert/modeling_flax_bert.py",
    -    "src/transformers/models/bert/modeling_tf_bert.py",
    -    "src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py",
    -    "src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py",
    -    "src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py",
    -    "src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py",
    -}
    -
    -VIT_MODEL_FILES = {
    -    "src/transformers/models/vit/__init__.py",
    -    "src/transformers/models/vit/configuration_vit.py",
    -    "src/transformers/models/vit/convert_dino_to_pytorch.py",
    -    "src/transformers/models/vit/convert_vit_timm_to_pytorch.py",
    -    "src/transformers/models/vit/feature_extraction_vit.py",
    -    "src/transformers/models/vit/image_processing_vit.py",
    -    "src/transformers/models/vit/modeling_vit.py",
    -    "src/transformers/models/vit/modeling_tf_vit.py",
    -    "src/transformers/models/vit/modeling_flax_vit.py",
    -}
    -
    -WAV2VEC2_MODEL_FILES = {
    -    "src/transformers/models/wav2vec2/__init__.py",
    -    "src/transformers/models/wav2vec2/configuration_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py",
    -    "src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py",
    -    "src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/modeling_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/processing_wav2vec2.py",
    -    "src/transformers/models/wav2vec2/tokenization_wav2vec2.py",
    -}
    -
    -REPO_PATH = Path(transformers.__path__[0]).parent.parent
    -
    -
    -@require_torch
    -@require_tf
    -@require_flax
    -class TestAddNewModelLike(unittest.TestCase):
    -    def init_file(self, file_name, content):
    -        with open(file_name, "w", encoding="utf-8") as f:
    -            f.write(content)
    -
    -    def check_result(self, file_name, expected_result):
    -        with open(file_name, "r", encoding="utf-8") as f:
    -            result = f.read()
    -            self.assertEqual(result, expected_result)
    -
    -    def test_re_class_func(self):
    -        self.assertEqual(_re_class_func.search("def my_function(x, y):").groups()[0], "my_function")
    -        self.assertEqual(_re_class_func.search("class MyClass:").groups()[0], "MyClass")
    -        self.assertEqual(_re_class_func.search("class MyClass(SuperClass):").groups()[0], "MyClass")
    -
    -    def test_model_patterns_defaults(self):
    -        model_patterns = ModelPatterns("GPT-New new", "huggingface/gpt-new-base")
    -
    -        self.assertEqual(model_patterns.model_type, "gpt-new-new")
    -        self.assertEqual(model_patterns.model_lower_cased, "gpt_new_new")
    -        self.assertEqual(model_patterns.model_camel_cased, "GPTNewNew")
    -        self.assertEqual(model_patterns.model_upper_cased, "GPT_NEW_NEW")
    -        self.assertEqual(model_patterns.config_class, "GPTNewNewConfig")
    -        self.assertIsNone(model_patterns.tokenizer_class)
    -        self.assertIsNone(model_patterns.feature_extractor_class)
    -        self.assertIsNone(model_patterns.processor_class)
    -
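A rough sketch of the casing conventions checked above (the transforms below are illustrative only, not the ModelPatterns implementation):

model_name = "GPT-New new"
parts = model_name.replace("-", " ").split()              # ["GPT", "New", "new"]
model_camel_cased = "".join(p if p.isupper() else p[0].upper() + p[1:] for p in parts)  # "GPTNewNew"
model_lower_cased = "_".join(p.lower() for p in parts)    # "gpt_new_new"
model_upper_cased = model_lower_cased.upper()             # "GPT_NEW_NEW"
model_type = model_lower_cased.replace("_", "-")          # "gpt-new-new"
config_class = model_camel_cased + "Config"               # "GPTNewNewConfig"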
    -    def test_parse_module_content(self):
    -        test_code = """SOME_CONSTANT = a constant
    -
    -CONSTANT_DEFINED_ON_SEVERAL_LINES = [
    -    first_item,
    -    second_item
    -]
    -
    -def function(args):
    -    some code
    -
    -# Copied from transformers.some_module
    -class SomeClass:
    -    some code
    -"""
    -
    -        expected_parts = [
    -            "SOME_CONSTANT = a constant\n",
    -            "CONSTANT_DEFINED_ON_SEVERAL_LINES = [\n    first_item,\n    second_item\n]",
    -            "",
    -            "def function(args):\n    some code\n",
    -            "# Copied from transformers.some_module\nclass SomeClass:\n    some code\n",
    -        ]
    -        self.assertEqual(parse_module_content(test_code), expected_parts)
    -
    -    def test_add_content_to_text(self):
    -        test_text = """all_configs = {
    -    "gpt": "GPTConfig",
    -    "bert": "BertConfig",
    -    "t5": "T5Config",
    -}"""
    -
    -        expected = """all_configs = {
    -    "gpt": "GPTConfig",
    -    "gpt2": "GPT2Config",
    -    "bert": "BertConfig",
    -    "t5": "T5Config",
    -}"""
    -        line = '    "gpt2": "GPT2Config",'
    -
    -        self.assertEqual(add_content_to_text(test_text, line, add_before="bert"), expected)
    -        self.assertEqual(add_content_to_text(test_text, line, add_before="bert", exact_match=True), test_text)
    -        self.assertEqual(
    -            add_content_to_text(test_text, line, add_before='    "bert": "BertConfig",', exact_match=True), expected
    -        )
    -        self.assertEqual(add_content_to_text(test_text, line, add_before=re.compile(r'^\s*"bert":')), expected)
    -
    -        self.assertEqual(add_content_to_text(test_text, line, add_after="gpt"), expected)
    -        self.assertEqual(add_content_to_text(test_text, line, add_after="gpt", exact_match=True), test_text)
    -        self.assertEqual(
    -            add_content_to_text(test_text, line, add_after='    "gpt": "GPTConfig",', exact_match=True), expected
    -        )
    -        self.assertEqual(add_content_to_text(test_text, line, add_after=re.compile(r'^\s*"gpt":')), expected)
    -
    -    def test_add_content_to_file(self):
    -        test_text = """all_configs = {
    -    "gpt": "GPTConfig",
    -    "bert": "BertConfig",
    -    "t5": "T5Config",
    -}"""
    -
    -        expected = """all_configs = {
    -    "gpt": "GPTConfig",
    -    "gpt2": "GPT2Config",
    -    "bert": "BertConfig",
    -    "t5": "T5Config",
    -}"""
    -        line = '    "gpt2": "GPT2Config",'
    -
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            file_name = os.path.join(tmp_dir, "code.py")
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_before="bert")
    -            self.check_result(file_name, expected)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_before="bert", exact_match=True)
    -            self.check_result(file_name, test_text)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_before='    "bert": "BertConfig",', exact_match=True)
    -            self.check_result(file_name, expected)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_before=re.compile(r'^\s*"bert":'))
    -            self.check_result(file_name, expected)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_after="gpt")
    -            self.check_result(file_name, expected)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_after="gpt", exact_match=True)
    -            self.check_result(file_name, test_text)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_after='    "gpt": "GPTConfig",', exact_match=True)
    -            self.check_result(file_name, expected)
    -
    -            self.init_file(file_name, test_text)
    -            add_content_to_file(file_name, line, add_after=re.compile(r'^\s*"gpt":'))
    -            self.check_result(file_name, expected)
    -
    -    def test_simplify_replacements(self):
    -        self.assertEqual(simplify_replacements([("Bert", "NewBert")]), [("Bert", "NewBert")])
    -        self.assertEqual(
    -            simplify_replacements([("Bert", "NewBert"), ("bert", "new-bert")]),
    -            [("Bert", "NewBert"), ("bert", "new-bert")],
    -        )
    -        self.assertEqual(
    -            simplify_replacements([("BertConfig", "NewBertConfig"), ("Bert", "NewBert"), ("bert", "new-bert")]),
    -            [("Bert", "NewBert"), ("bert", "new-bert")],
    -        )
    -
    -    def test_replace_model_patterns(self):
    -        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
    -        new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
    -        bert_test = '''class TFBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = BertConfig
    -    load_tf_weights = load_tf_weights_in_bert
    -    base_model_prefix = "bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -    model_type = "bert"
    -
    -BERT_CONSTANT = "value"
    -'''
    -        bert_expected = '''class TFNewBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = NewBertConfig
    -    load_tf_weights = load_tf_weights_in_new_bert
    -    base_model_prefix = "new_bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -    model_type = "new-bert"
    -
    -NEW_BERT_CONSTANT = "value"
    -'''
    -
    -        bert_converted, replacements = replace_model_patterns(bert_test, bert_model_patterns, new_bert_model_patterns)
    -        self.assertEqual(bert_converted, bert_expected)
-        # Replacements are empty here since bert has been replaced by new_bert in some instances and new-bert
-        # in others.
    -        self.assertEqual(replacements, "")
    -
    -        # If we remove the model type, we will get replacements
    -        bert_test = bert_test.replace('    model_type = "bert"\n', "")
    -        bert_expected = bert_expected.replace('    model_type = "new-bert"\n', "")
    -        bert_converted, replacements = replace_model_patterns(bert_test, bert_model_patterns, new_bert_model_patterns)
    -        self.assertEqual(bert_converted, bert_expected)
    -        self.assertEqual(replacements, "BERT->NEW_BERT,Bert->NewBert,bert->new_bert")
    -
    -        gpt_model_patterns = ModelPatterns("GPT2", "gpt2")
    -        new_gpt_model_patterns = ModelPatterns("GPT-New new", "huggingface/gpt-new-base")
    -        gpt_test = '''class GPT2PreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = GPT2Config
    -    load_tf_weights = load_tf_weights_in_gpt2
    -    base_model_prefix = "transformer"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -GPT2_CONSTANT = "value"
    -'''
    -
    -        gpt_expected = '''class GPTNewNewPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = GPTNewNewConfig
    -    load_tf_weights = load_tf_weights_in_gpt_new_new
    -    base_model_prefix = "transformer"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -GPT_NEW_NEW_CONSTANT = "value"
    -'''
    -
    -        gpt_converted, replacements = replace_model_patterns(gpt_test, gpt_model_patterns, new_gpt_model_patterns)
    -        self.assertEqual(gpt_converted, gpt_expected)
-        # Replacements are empty here since GPT2 has been replaced by GPTNewNew in some instances and GPT_NEW_NEW
-        # in others.
    -        self.assertEqual(replacements, "")
    -
    -        roberta_model_patterns = ModelPatterns("RoBERTa", "roberta-base", model_camel_cased="Roberta")
    -        new_roberta_model_patterns = ModelPatterns(
    -            "RoBERTa-New", "huggingface/roberta-new-base", model_camel_cased="RobertaNew"
    -        )
    -        roberta_test = '''# Copied from transformers.models.bert.BertModel with Bert->Roberta
    -class RobertaModel(RobertaPreTrainedModel):
    -    """ The base RoBERTa model. """
    -    checkpoint = roberta-base
    -    base_model_prefix = "roberta"
    -        '''
    -        roberta_expected = '''# Copied from transformers.models.bert.BertModel with Bert->RobertaNew
    -class RobertaNewModel(RobertaNewPreTrainedModel):
    -    """ The base RoBERTa-New model. """
    -    checkpoint = huggingface/roberta-new-base
    -    base_model_prefix = "roberta_new"
    -        '''
    -        roberta_converted, replacements = replace_model_patterns(
    -            roberta_test, roberta_model_patterns, new_roberta_model_patterns
    -        )
    -        self.assertEqual(roberta_converted, roberta_expected)
    -
    -    def test_get_module_from_file(self):
    -        self.assertEqual(
    -            get_module_from_file("/git/transformers/src/transformers/models/bert/modeling_tf_bert.py"),
    -            "transformers.models.bert.modeling_tf_bert",
    -        )
    -        self.assertEqual(
    -            get_module_from_file("/transformers/models/gpt2/modeling_gpt2.py"),
    -            "transformers.models.gpt2.modeling_gpt2",
    -        )
    -        with self.assertRaises(ValueError):
    -            get_module_from_file("/models/gpt2/modeling_gpt2.py")
    -
    -    def test_duplicate_module(self):
    -        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
    -        new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
    -        bert_test = '''class TFBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = BertConfig
    -    load_tf_weights = load_tf_weights_in_bert
    -    base_model_prefix = "bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -BERT_CONSTANT = "value"
    -'''
    -        bert_expected = '''class TFNewBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = NewBertConfig
    -    load_tf_weights = load_tf_weights_in_new_bert
    -    base_model_prefix = "new_bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -NEW_BERT_CONSTANT = "value"
    -'''
    -        bert_expected_with_copied_from = (
    -            "# Copied from transformers.bert_module.TFBertPreTrainedModel with Bert->NewBert,bert->new_bert\n"
    -            + bert_expected
    -        )
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            work_dir = os.path.join(tmp_dir, "transformers")
    -            os.makedirs(work_dir)
    -            file_name = os.path.join(work_dir, "bert_module.py")
    -            dest_file_name = os.path.join(work_dir, "new_bert_module.py")
    -
    -            self.init_file(file_name, bert_test)
    -            duplicate_module(file_name, bert_model_patterns, new_bert_model_patterns)
    -            self.check_result(dest_file_name, bert_expected_with_copied_from)
    -
    -            self.init_file(file_name, bert_test)
    -            duplicate_module(file_name, bert_model_patterns, new_bert_model_patterns, add_copied_from=False)
    -            self.check_result(dest_file_name, bert_expected)
    -
    -    def test_duplicate_module_with_copied_from(self):
    -        bert_model_patterns = ModelPatterns("Bert", "bert-base-cased")
    -        new_bert_model_patterns = ModelPatterns("New Bert", "huggingface/bert-new-base")
    -        bert_test = '''# Copied from transformers.models.xxx.XxxModel with Xxx->Bert
    -class TFBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = BertConfig
    -    load_tf_weights = load_tf_weights_in_bert
    -    base_model_prefix = "bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -BERT_CONSTANT = "value"
    -'''
    -        bert_expected = '''# Copied from transformers.models.xxx.XxxModel with Xxx->NewBert
    -class TFNewBertPreTrainedModel(PreTrainedModel):
    -    """
    -    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    -    models.
    -    """
    -
    -    config_class = NewBertConfig
    -    load_tf_weights = load_tf_weights_in_new_bert
    -    base_model_prefix = "new_bert"
    -    is_parallelizable = True
    -    supports_gradient_checkpointing = True
    -
    -NEW_BERT_CONSTANT = "value"
    -'''
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            work_dir = os.path.join(tmp_dir, "transformers")
    -            os.makedirs(work_dir)
    -            file_name = os.path.join(work_dir, "bert_module.py")
    -            dest_file_name = os.path.join(work_dir, "new_bert_module.py")
    -
    -            self.init_file(file_name, bert_test)
    -            duplicate_module(file_name, bert_model_patterns, new_bert_model_patterns)
-            # There should not be a new Copied from statement; the old one should be adapted.
    -            self.check_result(dest_file_name, bert_expected)
    -
    -            self.init_file(file_name, bert_test)
    -            duplicate_module(file_name, bert_model_patterns, new_bert_model_patterns, add_copied_from=False)
    -            self.check_result(dest_file_name, bert_expected)
    -
    -    def test_filter_framework_files(self):
    -        files = ["modeling_bert.py", "modeling_tf_bert.py", "modeling_flax_bert.py", "configuration_bert.py"]
    -        self.assertEqual(filter_framework_files(files), files)
    -        self.assertEqual(set(filter_framework_files(files, ["pt", "tf", "flax"])), set(files))
    -
    -        self.assertEqual(set(filter_framework_files(files, ["pt"])), {"modeling_bert.py", "configuration_bert.py"})
    -        self.assertEqual(set(filter_framework_files(files, ["tf"])), {"modeling_tf_bert.py", "configuration_bert.py"})
    -        self.assertEqual(
    -            set(filter_framework_files(files, ["flax"])), {"modeling_flax_bert.py", "configuration_bert.py"}
    -        )
    -
    -        self.assertEqual(
    -            set(filter_framework_files(files, ["pt", "tf"])),
    -            {"modeling_tf_bert.py", "modeling_bert.py", "configuration_bert.py"},
    -        )
    -        self.assertEqual(
    -            set(filter_framework_files(files, ["tf", "flax"])),
    -            {"modeling_tf_bert.py", "modeling_flax_bert.py", "configuration_bert.py"},
    -        )
    -        self.assertEqual(
    -            set(filter_framework_files(files, ["pt", "flax"])),
    -            {"modeling_bert.py", "modeling_flax_bert.py", "configuration_bert.py"},
    -        )
    -
    -    def test_get_model_files(self):
    -        # BERT
    -        bert_files = get_model_files("bert")
    -
    -        doc_file = str(Path(bert_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/bert.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["model_files"]}
    -        self.assertEqual(model_files, BERT_MODEL_FILES)
    -
    -        self.assertEqual(bert_files["module_name"], "bert")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["test_files"]}
    -        bert_test_files = {
    -            "tests/models/bert/test_tokenization_bert.py",
    -            "tests/models/bert/test_modeling_bert.py",
    -            "tests/models/bert/test_modeling_tf_bert.py",
    -            "tests/models/bert/test_modeling_flax_bert.py",
    -        }
    -        self.assertEqual(test_files, bert_test_files)
    -
    -        # VIT
    -        vit_files = get_model_files("vit")
    -        doc_file = str(Path(vit_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/vit.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["model_files"]}
    -        self.assertEqual(model_files, VIT_MODEL_FILES)
    -
    -        self.assertEqual(vit_files["module_name"], "vit")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["test_files"]}
    -        vit_test_files = {
    -            "tests/models/vit/test_image_processing_vit.py",
    -            "tests/models/vit/test_modeling_vit.py",
    -            "tests/models/vit/test_modeling_tf_vit.py",
    -            "tests/models/vit/test_modeling_flax_vit.py",
    -        }
    -        self.assertEqual(test_files, vit_test_files)
    -
    -        # Wav2Vec2
    -        wav2vec2_files = get_model_files("wav2vec2")
    -        doc_file = str(Path(wav2vec2_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/wav2vec2.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["model_files"]}
    -        self.assertEqual(model_files, WAV2VEC2_MODEL_FILES)
    -
    -        self.assertEqual(wav2vec2_files["module_name"], "wav2vec2")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["test_files"]}
    -        wav2vec2_test_files = {
    -            "tests/models/wav2vec2/test_feature_extraction_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_tf_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_flax_wav2vec2.py",
    -            "tests/models/wav2vec2/test_processor_wav2vec2.py",
    -            "tests/models/wav2vec2/test_tokenization_wav2vec2.py",
    -        }
    -        self.assertEqual(test_files, wav2vec2_test_files)
    -
    -    def test_get_model_files_only_pt(self):
    -        # BERT
    -        bert_files = get_model_files("bert", frameworks=["pt"])
    -
    -        doc_file = str(Path(bert_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/bert.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["model_files"]}
    -        bert_model_files = BERT_MODEL_FILES - {
    -            "src/transformers/models/bert/modeling_tf_bert.py",
    -            "src/transformers/models/bert/modeling_flax_bert.py",
    -        }
    -        self.assertEqual(model_files, bert_model_files)
    -
    -        self.assertEqual(bert_files["module_name"], "bert")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["test_files"]}
    -        bert_test_files = {
    -            "tests/models/bert/test_tokenization_bert.py",
    -            "tests/models/bert/test_modeling_bert.py",
    -        }
    -        self.assertEqual(test_files, bert_test_files)
    -
    -        # VIT
    -        vit_files = get_model_files("vit", frameworks=["pt"])
    -        doc_file = str(Path(vit_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/vit.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["model_files"]}
    -        vit_model_files = VIT_MODEL_FILES - {
    -            "src/transformers/models/vit/modeling_tf_vit.py",
    -            "src/transformers/models/vit/modeling_flax_vit.py",
    -        }
    -        self.assertEqual(model_files, vit_model_files)
    -
    -        self.assertEqual(vit_files["module_name"], "vit")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["test_files"]}
    -        vit_test_files = {
    -            "tests/models/vit/test_image_processing_vit.py",
    -            "tests/models/vit/test_modeling_vit.py",
    -        }
    -        self.assertEqual(test_files, vit_test_files)
    -
    -        # Wav2Vec2
    -        wav2vec2_files = get_model_files("wav2vec2", frameworks=["pt"])
    -        doc_file = str(Path(wav2vec2_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/wav2vec2.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["model_files"]}
    -        wav2vec2_model_files = WAV2VEC2_MODEL_FILES - {
    -            "src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py",
    -            "src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py",
    -        }
    -        self.assertEqual(model_files, wav2vec2_model_files)
    -
    -        self.assertEqual(wav2vec2_files["module_name"], "wav2vec2")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["test_files"]}
    -        wav2vec2_test_files = {
    -            "tests/models/wav2vec2/test_feature_extraction_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_wav2vec2.py",
    -            "tests/models/wav2vec2/test_processor_wav2vec2.py",
    -            "tests/models/wav2vec2/test_tokenization_wav2vec2.py",
    -        }
    -        self.assertEqual(test_files, wav2vec2_test_files)
    -
    -    def test_get_model_files_tf_and_flax(self):
    -        # BERT
    -        bert_files = get_model_files("bert", frameworks=["tf", "flax"])
    -
    -        doc_file = str(Path(bert_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/bert.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["model_files"]}
    -        bert_model_files = BERT_MODEL_FILES - {"src/transformers/models/bert/modeling_bert.py"}
    -        self.assertEqual(model_files, bert_model_files)
    -
    -        self.assertEqual(bert_files["module_name"], "bert")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in bert_files["test_files"]}
    -        bert_test_files = {
    -            "tests/models/bert/test_tokenization_bert.py",
    -            "tests/models/bert/test_modeling_tf_bert.py",
    -            "tests/models/bert/test_modeling_flax_bert.py",
    -        }
    -        self.assertEqual(test_files, bert_test_files)
    -
    -        # VIT
    -        vit_files = get_model_files("vit", frameworks=["tf", "flax"])
    -        doc_file = str(Path(vit_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/vit.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["model_files"]}
    -        vit_model_files = VIT_MODEL_FILES - {"src/transformers/models/vit/modeling_vit.py"}
    -        self.assertEqual(model_files, vit_model_files)
    -
    -        self.assertEqual(vit_files["module_name"], "vit")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in vit_files["test_files"]}
    -        vit_test_files = {
    -            "tests/models/vit/test_image_processing_vit.py",
    -            "tests/models/vit/test_modeling_tf_vit.py",
    -            "tests/models/vit/test_modeling_flax_vit.py",
    -        }
    -        self.assertEqual(test_files, vit_test_files)
    -
    -        # Wav2Vec2
    -        wav2vec2_files = get_model_files("wav2vec2", frameworks=["tf", "flax"])
    -        doc_file = str(Path(wav2vec2_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/wav2vec2.md")
    -
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["model_files"]}
    -        wav2vec2_model_files = WAV2VEC2_MODEL_FILES - {"src/transformers/models/wav2vec2/modeling_wav2vec2.py"}
    -        self.assertEqual(model_files, wav2vec2_model_files)
    -
    -        self.assertEqual(wav2vec2_files["module_name"], "wav2vec2")
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in wav2vec2_files["test_files"]}
    -        wav2vec2_test_files = {
    -            "tests/models/wav2vec2/test_feature_extraction_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_tf_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_flax_wav2vec2.py",
    -            "tests/models/wav2vec2/test_processor_wav2vec2.py",
    -            "tests/models/wav2vec2/test_tokenization_wav2vec2.py",
    -        }
    -        self.assertEqual(test_files, wav2vec2_test_files)
    -
    -    def test_find_base_model_checkpoint(self):
    -        self.assertEqual(find_base_model_checkpoint("bert"), "bert-base-uncased")
    -        self.assertEqual(find_base_model_checkpoint("gpt2"), "gpt2")
    -
    -    def test_retrieve_model_classes(self):
    -        gpt_classes = {k: set(v) for k, v in retrieve_model_classes("gpt2").items()}
    -        expected_gpt_classes = {
    -            "pt": {"GPT2ForTokenClassification", "GPT2Model", "GPT2LMHeadModel", "GPT2ForSequenceClassification"},
    -            "tf": {"TFGPT2Model", "TFGPT2ForSequenceClassification", "TFGPT2LMHeadModel"},
    -            "flax": {"FlaxGPT2Model", "FlaxGPT2LMHeadModel"},
    -        }
    -        self.assertEqual(gpt_classes, expected_gpt_classes)
    -
    -        del expected_gpt_classes["flax"]
    -        gpt_classes = {k: set(v) for k, v in retrieve_model_classes("gpt2", frameworks=["pt", "tf"]).items()}
    -        self.assertEqual(gpt_classes, expected_gpt_classes)
    -
    -        del expected_gpt_classes["pt"]
    -        gpt_classes = {k: set(v) for k, v in retrieve_model_classes("gpt2", frameworks=["tf"]).items()}
    -        self.assertEqual(gpt_classes, expected_gpt_classes)
    -
    -    def test_retrieve_info_for_model_with_bert(self):
    -        bert_info = retrieve_info_for_model("bert")
    -        bert_classes = [
    -            "BertForTokenClassification",
    -            "BertForQuestionAnswering",
    -            "BertForNextSentencePrediction",
    -            "BertForSequenceClassification",
    -            "BertForMaskedLM",
    -            "BertForMultipleChoice",
    -            "BertModel",
    -            "BertForPreTraining",
    -            "BertLMHeadModel",
    -        ]
    -        expected_model_classes = {
    -            "pt": set(bert_classes),
    -            "tf": {f"TF{m}" for m in bert_classes},
    -            "flax": {f"Flax{m}" for m in bert_classes[:-1] + ["BertForCausalLM"]},
    -        }
    -
    -        self.assertEqual(set(bert_info["frameworks"]), {"pt", "tf", "flax"})
    -        model_classes = {k: set(v) for k, v in bert_info["model_classes"].items()}
    -        self.assertEqual(model_classes, expected_model_classes)
    -
    -        all_bert_files = bert_info["model_files"]
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_bert_files["model_files"]}
    -        self.assertEqual(model_files, BERT_MODEL_FILES)
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_bert_files["test_files"]}
    -        bert_test_files = {
    -            "tests/models/bert/test_tokenization_bert.py",
    -            "tests/models/bert/test_modeling_bert.py",
    -            "tests/models/bert/test_modeling_tf_bert.py",
    -            "tests/models/bert/test_modeling_flax_bert.py",
    -        }
    -        self.assertEqual(test_files, bert_test_files)
    -
    -        doc_file = str(Path(all_bert_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/bert.md")
    -
    -        self.assertEqual(all_bert_files["module_name"], "bert")
    -
    -        bert_model_patterns = bert_info["model_patterns"]
    -        self.assertEqual(bert_model_patterns.model_name, "BERT")
    -        self.assertEqual(bert_model_patterns.checkpoint, "bert-base-uncased")
    -        self.assertEqual(bert_model_patterns.model_type, "bert")
    -        self.assertEqual(bert_model_patterns.model_lower_cased, "bert")
    -        self.assertEqual(bert_model_patterns.model_camel_cased, "Bert")
    -        self.assertEqual(bert_model_patterns.model_upper_cased, "BERT")
    -        self.assertEqual(bert_model_patterns.config_class, "BertConfig")
    -        self.assertEqual(bert_model_patterns.tokenizer_class, "BertTokenizer")
    -        self.assertIsNone(bert_model_patterns.feature_extractor_class)
    -        self.assertIsNone(bert_model_patterns.processor_class)
    -
    -    def test_retrieve_info_for_model_pt_tf_with_bert(self):
    -        bert_info = retrieve_info_for_model("bert", frameworks=["pt", "tf"])
    -        bert_classes = [
    -            "BertForTokenClassification",
    -            "BertForQuestionAnswering",
    -            "BertForNextSentencePrediction",
    -            "BertForSequenceClassification",
    -            "BertForMaskedLM",
    -            "BertForMultipleChoice",
    -            "BertModel",
    -            "BertForPreTraining",
    -            "BertLMHeadModel",
    -        ]
    -        expected_model_classes = {"pt": set(bert_classes), "tf": {f"TF{m}" for m in bert_classes}}
    -
    -        self.assertEqual(set(bert_info["frameworks"]), {"pt", "tf"})
    -        model_classes = {k: set(v) for k, v in bert_info["model_classes"].items()}
    -        self.assertEqual(model_classes, expected_model_classes)
    -
    -        all_bert_files = bert_info["model_files"]
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_bert_files["model_files"]}
    -        bert_model_files = BERT_MODEL_FILES - {"src/transformers/models/bert/modeling_flax_bert.py"}
    -        self.assertEqual(model_files, bert_model_files)
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_bert_files["test_files"]}
    -        bert_test_files = {
    -            "tests/models/bert/test_tokenization_bert.py",
    -            "tests/models/bert/test_modeling_bert.py",
    -            "tests/models/bert/test_modeling_tf_bert.py",
    -        }
    -        self.assertEqual(test_files, bert_test_files)
    -
    -        doc_file = str(Path(all_bert_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/bert.md")
    -
    -        self.assertEqual(all_bert_files["module_name"], "bert")
    -
    -        bert_model_patterns = bert_info["model_patterns"]
    -        self.assertEqual(bert_model_patterns.model_name, "BERT")
    -        self.assertEqual(bert_model_patterns.checkpoint, "bert-base-uncased")
    -        self.assertEqual(bert_model_patterns.model_type, "bert")
    -        self.assertEqual(bert_model_patterns.model_lower_cased, "bert")
    -        self.assertEqual(bert_model_patterns.model_camel_cased, "Bert")
    -        self.assertEqual(bert_model_patterns.model_upper_cased, "BERT")
    -        self.assertEqual(bert_model_patterns.config_class, "BertConfig")
    -        self.assertEqual(bert_model_patterns.tokenizer_class, "BertTokenizer")
    -        self.assertIsNone(bert_model_patterns.feature_extractor_class)
    -        self.assertIsNone(bert_model_patterns.processor_class)
    -
    -    def test_retrieve_info_for_model_with_vit(self):
    -        vit_info = retrieve_info_for_model("vit")
    -        vit_classes = ["ViTForImageClassification", "ViTModel"]
    -        pt_only_classes = ["ViTForMaskedImageModeling"]
    -        expected_model_classes = {
    -            "pt": set(vit_classes + pt_only_classes),
    -            "tf": {f"TF{m}" for m in vit_classes},
    -            "flax": {f"Flax{m}" for m in vit_classes},
    -        }
    -
    -        self.assertEqual(set(vit_info["frameworks"]), {"pt", "tf", "flax"})
    -        model_classes = {k: set(v) for k, v in vit_info["model_classes"].items()}
    -        self.assertEqual(model_classes, expected_model_classes)
    -
    -        all_vit_files = vit_info["model_files"]
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_vit_files["model_files"]}
    -        self.assertEqual(model_files, VIT_MODEL_FILES)
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_vit_files["test_files"]}
    -        vit_test_files = {
    -            "tests/models/vit/test_image_processing_vit.py",
    -            "tests/models/vit/test_modeling_vit.py",
    -            "tests/models/vit/test_modeling_tf_vit.py",
    -            "tests/models/vit/test_modeling_flax_vit.py",
    -        }
    -        self.assertEqual(test_files, vit_test_files)
    -
    -        doc_file = str(Path(all_vit_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/vit.md")
    -
    -        self.assertEqual(all_vit_files["module_name"], "vit")
    -
    -        vit_model_patterns = vit_info["model_patterns"]
    -        self.assertEqual(vit_model_patterns.model_name, "ViT")
    -        self.assertEqual(vit_model_patterns.checkpoint, "google/vit-base-patch16-224-in21k")
    -        self.assertEqual(vit_model_patterns.model_type, "vit")
    -        self.assertEqual(vit_model_patterns.model_lower_cased, "vit")
    -        self.assertEqual(vit_model_patterns.model_camel_cased, "ViT")
    -        self.assertEqual(vit_model_patterns.model_upper_cased, "VIT")
    -        self.assertEqual(vit_model_patterns.config_class, "ViTConfig")
    -        self.assertEqual(vit_model_patterns.feature_extractor_class, "ViTFeatureExtractor")
    -        self.assertEqual(vit_model_patterns.image_processor_class, "ViTImageProcessor")
    -        self.assertIsNone(vit_model_patterns.tokenizer_class)
    -        self.assertIsNone(vit_model_patterns.processor_class)
    -
    -    def test_retrieve_info_for_model_with_wav2vec2(self):
    -        wav2vec2_info = retrieve_info_for_model("wav2vec2")
    -        wav2vec2_classes = [
    -            "Wav2Vec2Model",
    -            "Wav2Vec2ForPreTraining",
    -            "Wav2Vec2ForAudioFrameClassification",
    -            "Wav2Vec2ForCTC",
    -            "Wav2Vec2ForMaskedLM",
    -            "Wav2Vec2ForSequenceClassification",
    -            "Wav2Vec2ForXVector",
    -        ]
    -        expected_model_classes = {
    -            "pt": set(wav2vec2_classes),
    -            "tf": {f"TF{m}" for m in wav2vec2_classes[:1]},
    -            "flax": {f"Flax{m}" for m in wav2vec2_classes[:2]},
    -        }
    -
    -        self.assertEqual(set(wav2vec2_info["frameworks"]), {"pt", "tf", "flax"})
    -        model_classes = {k: set(v) for k, v in wav2vec2_info["model_classes"].items()}
    -        self.assertEqual(model_classes, expected_model_classes)
    -
    -        all_wav2vec2_files = wav2vec2_info["model_files"]
    -        model_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_wav2vec2_files["model_files"]}
    -        self.assertEqual(model_files, WAV2VEC2_MODEL_FILES)
    -
    -        test_files = {str(Path(f).relative_to(REPO_PATH)) for f in all_wav2vec2_files["test_files"]}
    -        wav2vec2_test_files = {
    -            "tests/models/wav2vec2/test_feature_extraction_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_tf_wav2vec2.py",
    -            "tests/models/wav2vec2/test_modeling_flax_wav2vec2.py",
    -            "tests/models/wav2vec2/test_processor_wav2vec2.py",
    -            "tests/models/wav2vec2/test_tokenization_wav2vec2.py",
    -        }
    -        self.assertEqual(test_files, wav2vec2_test_files)
    -
    -        doc_file = str(Path(all_wav2vec2_files["doc_file"]).relative_to(REPO_PATH))
    -        self.assertEqual(doc_file, "docs/source/en/model_doc/wav2vec2.md")
    -
    -        self.assertEqual(all_wav2vec2_files["module_name"], "wav2vec2")
    -
    -        wav2vec2_model_patterns = wav2vec2_info["model_patterns"]
    -        self.assertEqual(wav2vec2_model_patterns.model_name, "Wav2Vec2")
    -        self.assertEqual(wav2vec2_model_patterns.checkpoint, "facebook/wav2vec2-base-960h")
    -        self.assertEqual(wav2vec2_model_patterns.model_type, "wav2vec2")
    -        self.assertEqual(wav2vec2_model_patterns.model_lower_cased, "wav2vec2")
    -        self.assertEqual(wav2vec2_model_patterns.model_camel_cased, "Wav2Vec2")
    -        self.assertEqual(wav2vec2_model_patterns.model_upper_cased, "WAV_2_VEC_2")
    -        self.assertEqual(wav2vec2_model_patterns.config_class, "Wav2Vec2Config")
    -        self.assertEqual(wav2vec2_model_patterns.feature_extractor_class, "Wav2Vec2FeatureExtractor")
    -        self.assertEqual(wav2vec2_model_patterns.processor_class, "Wav2Vec2Processor")
    -        self.assertEqual(wav2vec2_model_patterns.tokenizer_class, "Wav2Vec2CTCTokenizer")
    -
    -    def test_clean_frameworks_in_init_with_gpt(self):
    -        test_init = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
    -
    -_import_structure = {
    -    "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
    -    "tokenization_gpt2": ["GPT2Tokenizer"],
    -}
    -
    -try:
    -    if not is_tokenizers_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["tokenization_gpt2_fast"] = ["GPT2TokenizerFast"]
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_gpt2"] = ["GPT2Model"]
    -
    -try:
    -    if not is_tf_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_tf_gpt2"] = ["TFGPT2Model"]
    -
    -try:
    -    if not is_flax_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_flax_gpt2"] = ["FlaxGPT2Model"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2OnnxConfig
    -    from .tokenization_gpt2 import GPT2Tokenizer
    -
    -    try:
    -        if not is_tokenizers_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .tokenization_gpt2_fast import GPT2TokenizerFast
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_gpt2 import GPT2Model
    -
    -    try:
    -        if not is_tf_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_tf_gpt2 import TFGPT2Model
    -
    -    try:
    -        if not is_flax_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_flax_gpt2 import FlaxGPT2Model
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_no_tokenizer = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
    -
    -_import_structure = {
    -    "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
    -}
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_gpt2"] = ["GPT2Model"]
    -
    -try:
    -    if not is_tf_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_tf_gpt2"] = ["TFGPT2Model"]
    -
    -try:
    -    if not is_flax_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_flax_gpt2"] = ["FlaxGPT2Model"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2OnnxConfig
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_gpt2 import GPT2Model
    -
    -    try:
    -        if not is_tf_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_tf_gpt2 import TFGPT2Model
    -
    -    try:
    -        if not is_flax_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_flax_gpt2 import FlaxGPT2Model
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_pt_only = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
    -
    -_import_structure = {
    -    "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
    -    "tokenization_gpt2": ["GPT2Tokenizer"],
    -}
    -
    -try:
    -    if not is_tokenizers_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["tokenization_gpt2_fast"] = ["GPT2TokenizerFast"]
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_gpt2"] = ["GPT2Model"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2OnnxConfig
    -    from .tokenization_gpt2 import GPT2Tokenizer
    -
    -    try:
    -        if not is_tokenizers_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .tokenization_gpt2_fast import GPT2TokenizerFast
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_gpt2 import GPT2Model
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_pt_only_no_tokenizer = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_torch_available
    -
    -_import_structure = {
    -    "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
    -}
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_gpt2"] = ["GPT2Model"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2OnnxConfig
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_gpt2 import GPT2Model
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            file_name = os.path.join(tmp_dir, "__init__.py")
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, keep_processing=False)
    -            self.check_result(file_name, init_no_tokenizer)
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, frameworks=["pt"])
    -            self.check_result(file_name, init_pt_only)
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, frameworks=["pt"], keep_processing=False)
    -            self.check_result(file_name, init_pt_only_no_tokenizer)
    -
    -    def test_clean_frameworks_in_init_with_vit(self):
    -        test_init = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vision_available
    -
    -_import_structure = {
    -    "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
    -}
    -
    -try:
    -    if not is_vision_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["image_processing_vit"] = ["ViTImageProcessor"]
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_vit"] = ["ViTModel"]
    -
    -try:
    -    if not is_tf_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_tf_vit"] = ["TFViTModel"]
    -
    -try:
    -    if not is_flax_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_flax_vit"] = ["FlaxViTModel"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
    -
    -    try:
    -        if not is_vision_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .image_processing_vit import ViTImageProcessor
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_vit import ViTModel
    -
    -    try:
    -        if not is_tf_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_tf_vit import TFViTModel
    -
    -    try:
    -        if not is_flax_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_flax_vit import FlaxViTModel
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_no_feature_extractor = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
    -
    -_import_structure = {
    -    "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
    -}
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_vit"] = ["ViTModel"]
    -
    -try:
    -    if not is_tf_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_tf_vit"] = ["TFViTModel"]
    -
    -try:
    -    if not is_flax_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_flax_vit"] = ["FlaxViTModel"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_vit import ViTModel
    -
    -    try:
    -        if not is_tf_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_tf_vit import TFViTModel
    -
    -    try:
    -        if not is_flax_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_flax_vit import FlaxViTModel
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_pt_only = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_torch_available, is_vision_available
    -
    -_import_structure = {
    -    "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
    -}
    -
    -try:
    -    if not is_vision_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["image_processing_vit"] = ["ViTImageProcessor"]
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_vit"] = ["ViTModel"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
    -
    -    try:
    -        if not is_vision_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .image_processing_vit import ViTImageProcessor
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_vit import ViTModel
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        init_pt_only_no_feature_extractor = """
    -from typing import TYPE_CHECKING
    -
    -from ...utils import _LazyModule, is_torch_available
    -
    -_import_structure = {
    -    "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
    -}
    -
    -try:
    -    if not is_torch_available():
    -        raise OptionalDependencyNotAvailable()
    -except OptionalDependencyNotAvailable:
    -    pass
    -else:
    -    _import_structure["modeling_vit"] = ["ViTModel"]
    -
    -if TYPE_CHECKING:
    -    from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
    -
    -    try:
    -        if not is_torch_available():
    -            raise OptionalDependencyNotAvailable()
    -    except OptionalDependencyNotAvailable:
    -        pass
    -    else:
    -        from .modeling_vit import ViTModel
    -
    -else:
    -    import sys
    -
    -    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
    -"""
    -
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            file_name = os.path.join(tmp_dir, "__init__.py")
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, keep_processing=False)
    -            self.check_result(file_name, init_no_feature_extractor)
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, frameworks=["pt"])
    -            self.check_result(file_name, init_pt_only)
    -
    -            self.init_file(file_name, test_init)
    -            clean_frameworks_in_init(file_name, frameworks=["pt"], keep_processing=False)
    -            self.check_result(file_name, init_pt_only_no_feature_extractor)
    -
    -    def test_duplicate_doc_file(self):
    -        test_doc = """
    -# GPT2
    -
    -## Overview
    -
    -Overview of the model.
    -
    -## GPT2Config
    -
    -[[autodoc]] GPT2Config
    -
    -## GPT2Tokenizer
    -
    -[[autodoc]] GPT2Tokenizer
    -    - save_vocabulary
    -
    -## GPT2TokenizerFast
    -
    -[[autodoc]] GPT2TokenizerFast
    -
    -## GPT2 specific outputs
    -
    -[[autodoc]] models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput
    -
    -[[autodoc]] models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
    -
    -## GPT2Model
    -
    -[[autodoc]] GPT2Model
    -    - forward
    -
    -## TFGPT2Model
    -
    -[[autodoc]] TFGPT2Model
    -    - call
    -
    -## FlaxGPT2Model
    -
    -[[autodoc]] FlaxGPT2Model
    -    - __call__
    -
    -"""
    -        test_new_doc = """
    -# GPT-New New
    -
    -## Overview
    -
    -The GPT-New New model was proposed in []() by .
    -
    -
    -The abstract from the paper is the following:
    -
    -**
    -
    -Tips:
    -
    -
    -
    -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/).
    -The original code can be found [here]().
    -
    -
    -## GPTNewNewConfig
    -
    -[[autodoc]] GPTNewNewConfig
    -
    -## GPTNewNewTokenizer
    -
    -[[autodoc]] GPTNewNewTokenizer
    -    - save_vocabulary
    -
    -## GPTNewNewTokenizerFast
    -
    -[[autodoc]] GPTNewNewTokenizerFast
    -
    -## GPTNewNew specific outputs
    -
    -[[autodoc]] models.gpt_new_new.modeling_gpt_new_new.GPTNewNewDoubleHeadsModelOutput
    -
    -[[autodoc]] models.gpt_new_new.modeling_tf_gpt_new_new.TFGPTNewNewDoubleHeadsModelOutput
    -
    -## GPTNewNewModel
    -
    -[[autodoc]] GPTNewNewModel
    -    - forward
    -
    -## TFGPTNewNewModel
    -
    -[[autodoc]] TFGPTNewNewModel
    -    - call
    -
    -## FlaxGPTNewNewModel
    -
    -[[autodoc]] FlaxGPTNewNewModel
    -    - __call__
    -
    -"""
    -
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            doc_file = os.path.join(tmp_dir, "gpt2.md")
    -            new_doc_file = os.path.join(tmp_dir, "gpt-new-new.md")
    -
    -            gpt2_model_patterns = ModelPatterns("GPT2", "gpt2", tokenizer_class="GPT2Tokenizer")
    -            new_model_patterns = ModelPatterns(
    -                "GPT-New New", "huggingface/gpt-new-new", tokenizer_class="GPTNewNewTokenizer"
    -            )
    -
    -            self.init_file(doc_file, test_doc)
    -            duplicate_doc_file(doc_file, gpt2_model_patterns, new_model_patterns)
    -            self.check_result(new_doc_file, test_new_doc)
    -
    -            test_new_doc_pt_only = test_new_doc.replace(
    -                """
    -## TFGPTNewNewModel
    -
    -[[autodoc]] TFGPTNewNewModel
    -    - call
    -
    -## FlaxGPTNewNewModel
    -
    -[[autodoc]] FlaxGPTNewNewModel
    -    - __call__
    -
    -""",
    -                "",
    -            )
    -            self.init_file(doc_file, test_doc)
    -            duplicate_doc_file(doc_file, gpt2_model_patterns, new_model_patterns, frameworks=["pt"])
    -            self.check_result(new_doc_file, test_new_doc_pt_only)
    -
    -            test_new_doc_no_tok = test_new_doc.replace(
    -                """
    -## GPTNewNewTokenizer
    -
    -[[autodoc]] GPTNewNewTokenizer
    -    - save_vocabulary
    -
    -## GPTNewNewTokenizerFast
    -
    -[[autodoc]] GPTNewNewTokenizerFast
    -""",
    -                "",
    -            )
    -            new_model_patterns = ModelPatterns(
    -                "GPT-New New", "huggingface/gpt-new-new", tokenizer_class="GPT2Tokenizer"
    -            )
    -            self.init_file(doc_file, test_doc)
    -            duplicate_doc_file(doc_file, gpt2_model_patterns, new_model_patterns)
    -            self.check_result(new_doc_file, test_new_doc_no_tok)
    -
    -            test_new_doc_pt_only_no_tok = test_new_doc_no_tok.replace(
    -                """
    -## TFGPTNewNewModel
    -
    -[[autodoc]] TFGPTNewNewModel
    -    - call
    -
    -## FlaxGPTNewNewModel
    -
    -[[autodoc]] FlaxGPTNewNewModel
    -    - __call__
    -
    -""",
    -                "",
    -            )
    -            self.init_file(doc_file, test_doc)
    -            duplicate_doc_file(doc_file, gpt2_model_patterns, new_model_patterns, frameworks=["pt"])
    -            self.check_result(new_doc_file, test_new_doc_pt_only_no_tok)
    diff --git a/tests/transformers/tests/utils/test_audio_utils.py b/tests/transformers/tests/utils/test_audio_utils.py
    deleted file mode 100644
    index f81a0dffd5..0000000000
    --- a/tests/transformers/tests/utils/test_audio_utils.py
    +++ /dev/null
    @@ -1,651 +0,0 @@
    -# coding=utf-8
    -# Copyright 2023 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -import numpy as np
    -import pytest
    -from transformers.audio_utils import (
    -    amplitude_to_db,
    -    hertz_to_mel,
    -    mel_filter_bank,
    -    mel_to_hertz,
    -    power_to_db,
    -    spectrogram,
    -    window_function,
    -)
    -
    -
    -class AudioUtilsFunctionTester(unittest.TestCase):
    -    def test_hertz_to_mel(self):
    -        self.assertEqual(hertz_to_mel(0.0), 0.0)
    -        self.assertAlmostEqual(hertz_to_mel(100), 150.48910241)
    -
    -        inputs = np.array([100, 200])
    -        expected = np.array([150.48910241, 283.22989816])
    -        self.assertTrue(np.allclose(hertz_to_mel(inputs), expected))
    -
    -        self.assertEqual(hertz_to_mel(0.0, "slaney"), 0.0)
    -        self.assertEqual(hertz_to_mel(100, "slaney"), 1.5)
    -
    -        inputs = np.array([60, 100, 200, 1000, 1001, 2000])
    -        expected = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016])
    -        self.assertTrue(np.allclose(hertz_to_mel(inputs, "slaney"), expected))
    -
    -        with pytest.raises(ValueError):
    -            hertz_to_mel(100, mel_scale=None)
    -
    -    def test_mel_to_hertz(self):
    -        self.assertEqual(mel_to_hertz(0.0), 0.0)
    -        self.assertAlmostEqual(mel_to_hertz(150.48910241), 100)
    -
    -        inputs = np.array([150.48910241, 283.22989816])
    -        expected = np.array([100, 200])
    -        self.assertTrue(np.allclose(mel_to_hertz(inputs), expected))
    -
    -        self.assertEqual(mel_to_hertz(0.0, "slaney"), 0.0)
    -        self.assertEqual(mel_to_hertz(1.5, "slaney"), 100)
    -
    -        inputs = np.array([0.9, 1.5, 3.0, 15.0, 15.01453781, 25.08188016])
    -        expected = np.array([60, 100, 200, 1000, 1001, 2000])
    -        self.assertTrue(np.allclose(mel_to_hertz(inputs, "slaney"), expected))
    -
    -        with pytest.raises(ValueError):
    -            mel_to_hertz(100, mel_scale=None)
    -
    -    def test_mel_filter_bank_shape(self):
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=513,
    -            num_mel_filters=13,
    -            min_frequency=100,
    -            max_frequency=4000,
    -            sampling_rate=16000,
    -            norm=None,
    -            mel_scale="htk",
    -        )
    -        self.assertEqual(mel_filters.shape, (513, 13))
    -
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=513,
    -            num_mel_filters=13,
    -            min_frequency=100,
    -            max_frequency=4000,
    -            sampling_rate=16000,
    -            norm="slaney",
    -            mel_scale="slaney",
    -        )
    -        self.assertEqual(mel_filters.shape, (513, 13))
    -
    -    def test_mel_filter_bank_htk(self):
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=16,
    -            num_mel_filters=4,
    -            min_frequency=0,
    -            max_frequency=2000,
    -            sampling_rate=4000,
    -            norm=None,
    -            mel_scale="htk",
    -        )
    -        # fmt: off
    -        expected = np.array([
    -            [0.0       , 0.0       , 0.0       , 0.0       ],
    -            [0.61454786, 0.0       , 0.0       , 0.0       ],
    -            [0.82511046, 0.17488954, 0.0       , 0.0       ],
    -            [0.35597035, 0.64402965, 0.0       , 0.0       ],
    -            [0.0       , 0.91360726, 0.08639274, 0.0       ],
    -            [0.0       , 0.55547007, 0.44452993, 0.0       ],
    -            [0.0       , 0.19733289, 0.80266711, 0.0       ],
    -            [0.0       , 0.0       , 0.87724349, 0.12275651],
    -            [0.0       , 0.0       , 0.6038449 , 0.3961551 ],
    -            [0.0       , 0.0       , 0.33044631, 0.66955369],
    -            [0.0       , 0.0       , 0.05704771, 0.94295229],
    -            [0.0       , 0.0       , 0.0       , 0.83483975],
    -            [0.0       , 0.0       , 0.0       , 0.62612982],
    -            [0.0       , 0.0       , 0.0       , 0.41741988],
    -            [0.0       , 0.0       , 0.0       , 0.20870994],
    -            [0.0       , 0.0       , 0.0       , 0.0       ]
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(mel_filters, expected))
    -
    -    def test_mel_filter_bank_slaney(self):
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=16,
    -            num_mel_filters=4,
    -            min_frequency=0,
    -            max_frequency=2000,
    -            sampling_rate=4000,
    -            norm=None,
    -            mel_scale="slaney",
    -        )
    -        # fmt: off
    -        expected = np.array([
    -            [0.0       , 0.0       , 0.0       , 0.0       ],
    -            [0.39869419, 0.0       , 0.0       , 0.0       ],
    -            [0.79738839, 0.0       , 0.0       , 0.0       ],
    -            [0.80391742, 0.19608258, 0.0       , 0.0       ],
    -            [0.40522322, 0.59477678, 0.0       , 0.0       ],
    -            [0.00652903, 0.99347097, 0.0       , 0.0       ],
    -            [0.0       , 0.60796161, 0.39203839, 0.0       ],
    -            [0.0       , 0.20939631, 0.79060369, 0.0       ],
    -            [0.0       , 0.0       , 0.84685344, 0.15314656],
    -            [0.0       , 0.0       , 0.52418477, 0.47581523],
    -            [0.0       , 0.0       , 0.2015161 , 0.7984839 ],
    -            [0.0       , 0.0       , 0.0       , 0.9141874 ],
    -            [0.0       , 0.0       , 0.0       , 0.68564055],
    -            [0.0       , 0.0       , 0.0       , 0.4570937 ],
    -            [0.0       , 0.0       , 0.0       , 0.22854685],
    -            [0.0       , 0.0       , 0.0       , 0.0       ]
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(mel_filters, expected))
    -
    -    def test_mel_filter_bank_slaney_norm(self):
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=16,
    -            num_mel_filters=4,
    -            min_frequency=0,
    -            max_frequency=2000,
    -            sampling_rate=4000,
    -            norm="slaney",
    -            mel_scale="slaney",
    -        )
    -        # fmt: off
    -        expected = np.array([
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
    -            [1.19217795e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
    -            [2.38435591e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
    -            [2.40387905e-03, 5.86232616e-04, 0.00000000e+00, 0.00000000e+00],
    -            [1.21170110e-03, 1.77821783e-03, 0.00000000e+00, 0.00000000e+00],
    -            [1.95231437e-05, 2.97020305e-03, 0.00000000e+00, 0.00000000e+00],
    -            [0.00000000e+00, 1.81763684e-03, 1.04857612e-03, 0.00000000e+00],
    -            [0.00000000e+00, 6.26036972e-04, 2.11460963e-03, 0.00000000e+00],
    -            [0.00000000e+00, 0.00000000e+00, 2.26505954e-03, 3.07332945e-04],
    -            [0.00000000e+00, 0.00000000e+00, 1.40202503e-03, 9.54861093e-04],
    -            [0.00000000e+00, 0.00000000e+00, 5.38990521e-04, 1.60238924e-03],
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.83458185e-03],
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.37593638e-03],
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.17290923e-04],
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.58645462e-04],
    -            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(mel_filters, expected))
    -
    -    def test_window_function(self):
    -        window = window_function(16, "hann")
    -        self.assertEqual(len(window), 16)
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.0, 0.03806023, 0.14644661, 0.30865828, 0.5, 0.69134172, 0.85355339, 0.96193977,
    -            1.0, 0.96193977, 0.85355339, 0.69134172, 0.5, 0.30865828, 0.14644661, 0.03806023,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(window, expected))
    -
    -    def _load_datasamples(self, num_samples):
    -        from datasets import load_dataset
    -
    -        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    -        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
    -        return [x["array"] for x in speech_samples]
    -
    -    def test_spectrogram_impulse(self):
    -        waveform = np.zeros(40)
    -        waveform[9] = 1.0  # impulse shifted in time
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(12, "hann", frame_length=16),
    -            frame_length=16,
    -            hop_length=4,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (9, 11))
    -
    -        expected = np.array([[0.0, 0.0669873, 0.9330127, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    -        self.assertTrue(np.allclose(spec, expected))
    -
    -    def test_spectrogram_integration_test(self):
    -        waveform = self._load_datasamples(1)[0]
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann", frame_length=512),
    -            frame_length=512,
    -            hop_length=128,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.02464888, 0.04648664, 0.05872392, 0.02311783, 0.0327175 ,
    -            0.02433643, 0.01198814, 0.02055709, 0.01559287, 0.01394357,
    -            0.01299037, 0.01728045, 0.0254554 , 0.02486533, 0.02011792,
    -            0.01755333, 0.02100457, 0.02337024, 0.01436963, 0.01464558,
    -            0.0211017 , 0.0193489 , 0.01272165, 0.01858462, 0.03722598,
    -            0.0456542 , 0.03281558, 0.00620586, 0.02226466, 0.03618042,
    -            0.03508182, 0.02271432, 0.01051649, 0.01225771, 0.02315293,
    -            0.02331886, 0.01417785, 0.0106844 , 0.01791214, 0.017177  ,
    -            0.02125114, 0.05028201, 0.06830665, 0.05216664, 0.01963666,
    -            0.06941418, 0.11513043, 0.12257859, 0.10948435, 0.08568069,
    -            0.05509328, 0.05047818, 0.047112  , 0.05060737, 0.02982424,
    -            0.02803827, 0.02933729, 0.01760491, 0.00587815, 0.02117637,
    -            0.0293578 , 0.03452379, 0.02194803, 0.01676056,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[:64, 400], expected))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann"),
    -            frame_length=400,
    -            hop_length=128,
    -            fft_length=512,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -        self.assertTrue(np.allclose(spec[:64, 400], expected))
    -
    -    def test_spectrogram_center_padding(self):
    -        waveform = self._load_datasamples(1)[0]
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(512, "hann"),
    -            frame_length=512,
    -            hop_length=128,
    -            center=True,
    -            pad_mode="reflect",
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.1287945 , 0.12792738, 0.08311573, 0.03155122, 0.02470202,
    -            0.00727857, 0.00910694, 0.00686163, 0.01238981, 0.01473668,
    -            0.00336144, 0.00370314, 0.00600871, 0.01120164, 0.01942998,
    -            0.03132008, 0.0232842 , 0.01124642, 0.02754783, 0.02423725,
    -            0.00147893, 0.00038027, 0.00112299, 0.00596233, 0.00571529,
    -            0.02084235, 0.0231855 , 0.00810006, 0.01837943, 0.00651339,
    -            0.00093931, 0.00067426, 0.01058399, 0.01270507, 0.00151734,
    -            0.00331913, 0.00302416, 0.01081792, 0.00754549, 0.00148963,
    -            0.00111943, 0.00152573, 0.00608017, 0.01749986, 0.01205949,
    -            0.0143082 , 0.01910573, 0.00413786, 0.03916619, 0.09873404,
    -            0.08302026, 0.02673891, 0.00401255, 0.01397392, 0.00751862,
    -            0.01024884, 0.01544606, 0.00638907, 0.00623633, 0.0085103 ,
    -            0.00217659, 0.00276204, 0.00260835, 0.00299299,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[:64, 0], expected))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(512, "hann"),
    -            frame_length=512,
    -            hop_length=128,
    -            center=True,
    -            pad_mode="constant",
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.06558744, 0.06889656, 0.06263352, 0.04264418, 0.03404115,
    -            0.03244197, 0.02279134, 0.01646339, 0.01452216, 0.00826055,
    -            0.00062093, 0.0031821 , 0.00419456, 0.00689327, 0.01106367,
    -            0.01712119, 0.01721762, 0.00977533, 0.01606626, 0.02275621,
    -            0.01727687, 0.00992739, 0.01217688, 0.01049927, 0.01022947,
    -            0.01302475, 0.01166873, 0.01081812, 0.01057327, 0.00767912,
    -            0.00429567, 0.00089625, 0.00654583, 0.00912084, 0.00700984,
    -            0.00225026, 0.00290545, 0.00667712, 0.00730663, 0.00410813,
    -            0.00073102, 0.00219296, 0.00527618, 0.00996585, 0.01123781,
    -            0.00872816, 0.01165121, 0.02047945, 0.03681747, 0.0514379 ,
    -            0.05137928, 0.03960042, 0.02821562, 0.01813349, 0.01201322,
    -            0.01260964, 0.00900654, 0.00207905, 0.00456714, 0.00850599,
    -            0.00788239, 0.00664407, 0.00824227, 0.00628301,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[:64, 0], expected))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(512, "hann"),
    -            frame_length=512,
    -            hop_length=128,
    -            center=False,
    -        )
    -        self.assertEqual(spec.shape, (257, 728))
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.00250445, 0.02161521, 0.06232229, 0.04339567, 0.00937727,
    -            0.01080616, 0.00248685, 0.0095264 , 0.00727476, 0.0079152 ,
    -            0.00839946, 0.00254932, 0.00716622, 0.005559  , 0.00272623,
    -            0.00581774, 0.01896395, 0.01829788, 0.01020514, 0.01632692,
    -            0.00870888, 0.02065827, 0.0136022 , 0.0132382 , 0.011827  ,
    -            0.00194505, 0.0189979 , 0.026874  , 0.02194014, 0.01923883,
    -            0.01621437, 0.00661967, 0.00289517, 0.00470257, 0.00957801,
    -            0.00191455, 0.00431664, 0.00544359, 0.01126213, 0.00785778,
    -            0.00423469, 0.01322504, 0.02226548, 0.02318576, 0.03428908,
    -            0.03648811, 0.0202938 , 0.011902  , 0.03226198, 0.06347476,
    -            0.01306318, 0.05308729, 0.05474771, 0.03127991, 0.00998512,
    -            0.01449977, 0.01272741, 0.00868176, 0.00850386, 0.00313876,
    -            0.00811857, 0.00538216, 0.00685749, 0.00535275,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[:64, 0], expected))
    -
    -    def test_spectrogram_shapes(self):
    -        waveform = self._load_datasamples(1)[0]
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann"),
    -            frame_length=400,
    -            hop_length=128,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (201, 732))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann"),
    -            frame_length=400,
    -            hop_length=128,
    -            power=1.0,
    -            center=False,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (201, 729))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann"),
    -            frame_length=400,
    -            hop_length=128,
    -            fft_length=512,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=True,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann", frame_length=512),
    -            frame_length=512,
    -            hop_length=64,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=False,
    -        )
    -        self.assertEqual(spec.shape, (512, 1464))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(512, "hann"),
    -            frame_length=512,
    -            hop_length=64,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=False,
    -        )
    -        self.assertEqual(spec.shape, (512, 1464))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(512, "hann"),
    -            frame_length=512,
    -            hop_length=512,
    -            power=1.0,
    -            center=True,
    -            pad_mode="reflect",
    -            onesided=False,
    -        )
    -        self.assertEqual(spec.shape, (512, 183))
    -
    -    def test_mel_spectrogram(self):
    -        waveform = self._load_datasamples(1)[0]
    -
    -        mel_filters = mel_filter_bank(
    -            num_frequency_bins=513,
    -            num_mel_filters=13,
    -            min_frequency=100,
    -            max_frequency=4000,
    -            sampling_rate=16000,
    -            norm=None,
    -            mel_scale="htk",
    -        )
    -        self.assertEqual(mel_filters.shape, (513, 13))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(800, "hann", frame_length=1024),
    -            frame_length=1024,
    -            hop_length=128,
    -            power=2.0,
    -        )
    -        self.assertEqual(spec.shape, (513, 732))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(800, "hann", frame_length=1024),
    -            frame_length=1024,
    -            hop_length=128,
    -            power=2.0,
    -            mel_filters=mel_filters,
    -        )
    -        self.assertEqual(spec.shape, (13, 732))
    -
    -        # fmt: off
    -        expected = np.array([
    -            1.08027889e+02, 1.48080673e+01, 7.70758213e+00, 9.57676639e-01,
    -            8.81639061e-02, 5.26073833e-02, 1.52736155e-02, 9.95350117e-03,
    -            7.95364356e-03, 1.01148004e-02, 4.29241020e-03, 9.90708797e-03,
    -            9.44153646e-04
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[:, 300], expected))
    -
    -    def test_spectrogram_power(self):
    -        waveform = self._load_datasamples(1)[0]
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann", frame_length=512),
    -            frame_length=512,
    -            hop_length=128,
    -            power=None,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -        self.assertEqual(spec.dtype, np.complex64)
    -
    -        # fmt: off
    -        expected = np.array([
    -             0.01452305+0.01820039j, -0.01737362-0.01641946j,
    -             0.0121028 +0.01565081j, -0.02794554-0.03021514j,
    -             0.04719803+0.04086519j, -0.04391563-0.02779365j,
    -             0.05682834+0.01571325j, -0.08604821-0.02023657j,
    -             0.07497991+0.0186641j , -0.06366091-0.00922475j,
    -             0.11003416+0.0114788j , -0.13677941-0.01523552j,
    -             0.10934535-0.00117226j, -0.11635598+0.02551187j,
    -             0.14708674-0.03469823j, -0.1328196 +0.06034218j,
    -             0.12667368-0.13973421j, -0.14764774+0.18912019j,
    -             0.10235471-0.12181523j, -0.00773012+0.04730498j,
    -            -0.01487191-0.07312611j, -0.02739162+0.09619419j,
    -             0.02895459-0.05398273j,  0.01198589+0.05276592j,
    -            -0.02117299-0.10123465j,  0.00666388+0.09526499j,
    -            -0.01672773-0.05649684j,  0.02723125+0.05939891j,
    -            -0.01879361-0.062954j  ,  0.03686557+0.04568823j,
    -            -0.07394181-0.07949649j,  0.06238583+0.13905765j,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[64:96, 321], expected))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann", frame_length=512),
    -            frame_length=512,
    -            hop_length=128,
    -            power=1.0,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -        self.assertEqual(spec.dtype, np.float64)
    -
    -        # fmt: off
    -        expected = np.array([
    -            0.02328461, 0.02390484, 0.01978448, 0.04115711, 0.0624309 ,
    -            0.05197181, 0.05896072, 0.08839577, 0.07726794, 0.06432579,
    -            0.11063128, 0.13762532, 0.10935163, 0.11911998, 0.15112405,
    -            0.14588428, 0.18860507, 0.23992978, 0.15910825, 0.04793241,
    -            0.07462307, 0.10001811, 0.06125769, 0.05411011, 0.10342509,
    -            0.09549777, 0.05892122, 0.06534349, 0.06569936, 0.05870678,
    -            0.10856833, 0.1524107 , 0.11463385, 0.05766969, 0.12385171,
    -            0.14472842, 0.11978184, 0.10353675, 0.07244056, 0.03461861,
    -            0.02624896, 0.02227475, 0.01238363, 0.00885281, 0.0110049 ,
    -            0.00807005, 0.01033663, 0.01703181, 0.01445856, 0.00585615,
    -            0.0132431 , 0.02754132, 0.01524478, 0.0204908 , 0.07453328,
    -            0.10716327, 0.07195779, 0.08816078, 0.18340898, 0.16449876,
    -            0.12322842, 0.1621659 , 0.12334293, 0.06033659,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[64:128, 321], expected))
    -
    -        spec = spectrogram(
    -            waveform,
    -            window_function(400, "hann", frame_length=512),
    -            frame_length=512,
    -            hop_length=128,
    -            power=2.0,
    -        )
    -        self.assertEqual(spec.shape, (257, 732))
    -        self.assertEqual(spec.dtype, np.float64)
    -
    -        # fmt: off
    -        expected = np.array([
    -            5.42173162e-04, 5.71441371e-04, 3.91425507e-04, 1.69390778e-03,
    -            3.89761780e-03, 2.70106923e-03, 3.47636663e-03, 7.81381316e-03,
    -            5.97033510e-03, 4.13780799e-03, 1.22392802e-02, 1.89407300e-02,
    -            1.19577805e-02, 1.41895693e-02, 2.28384770e-02, 2.12822221e-02,
    -            3.55718732e-02, 5.75663000e-02, 2.53154356e-02, 2.29751552e-03,
    -            5.56860259e-03, 1.00036217e-02, 3.75250424e-03, 2.92790355e-03,
    -            1.06967501e-02, 9.11982451e-03, 3.47171025e-03, 4.26977174e-03,
    -            4.31640586e-03, 3.44648538e-03, 1.17870830e-02, 2.32290216e-02,
    -            1.31409196e-02, 3.32579296e-03, 1.53392460e-02, 2.09463164e-02,
    -            1.43476883e-02, 1.07198600e-02, 5.24763530e-03, 1.19844836e-03,
    -            6.89007982e-04, 4.96164430e-04, 1.53354369e-04, 7.83722571e-05,
    -            1.21107812e-04, 6.51257360e-05, 1.06845939e-04, 2.90082477e-04,
    -            2.09049831e-04, 3.42945241e-05, 1.75379610e-04, 7.58524227e-04,
    -            2.32403356e-04, 4.19872697e-04, 5.55520924e-03, 1.14839673e-02,
    -            5.17792348e-03, 7.77232368e-03, 3.36388536e-02, 2.70598419e-02,
    -            1.51852425e-02, 2.62977779e-02, 1.52134784e-02, 3.64050455e-03,
    -        ])
    -        # fmt: on
    -        self.assertTrue(np.allclose(spec[64:128, 321], expected))
    -
    -    def test_power_to_db(self):
    -        spectrogram = np.zeros((2, 3))
    -        spectrogram[0, 0] = 2.0
    -        spectrogram[0, 1] = 0.5
    -        spectrogram[0, 2] = 0.707
    -        spectrogram[1, 1] = 1.0
    -
    -        output = power_to_db(spectrogram, reference=1.0)
    -        expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-100.0, 0.0, -100.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = power_to_db(spectrogram, reference=2.0)
    -        expected = np.array([[0.0, -6.02059991, -4.51610582], [-103.01029996, -3.01029996, -103.01029996]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = power_to_db(spectrogram, min_value=1e-6)
    -        expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-60.0, 0.0, -60.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = power_to_db(spectrogram, db_range=80)
    -        expected = np.array([[3.01029996, -3.01029996, -1.50580586], [-76.98970004, 0.0, -76.98970004]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = power_to_db(spectrogram, reference=2.0, db_range=80)
    -        expected = np.array([[0.0, -6.02059991, -4.51610582], [-80.0, -3.01029996, -80.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = power_to_db(spectrogram, reference=2.0, min_value=1e-6, db_range=80)
    -        expected = np.array([[0.0, -6.02059991, -4.51610582], [-63.01029996, -3.01029996, -63.01029996]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        with pytest.raises(ValueError):
    -            power_to_db(spectrogram, reference=0.0)
    -        with pytest.raises(ValueError):
    -            power_to_db(spectrogram, min_value=0.0)
    -        with pytest.raises(ValueError):
    -            power_to_db(spectrogram, db_range=-80)
    -
    -    def test_amplitude_to_db(self):
    -        spectrogram = np.zeros((2, 3))
    -        spectrogram[0, 0] = 2.0
    -        spectrogram[0, 1] = 0.5
    -        spectrogram[0, 2] = 0.707
    -        spectrogram[1, 1] = 1.0
    -
    -        output = amplitude_to_db(spectrogram, reference=1.0)
    -        expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-100.0, 0.0, -100.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = amplitude_to_db(spectrogram, reference=2.0)
    -        expected = np.array([[0.0, -12.04119983, -9.03221164], [-106.02059991, -6.02059991, -106.02059991]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = amplitude_to_db(spectrogram, min_value=1e-3)
    -        expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-60.0, 0.0, -60.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = amplitude_to_db(spectrogram, db_range=80)
    -        expected = np.array([[6.02059991, -6.02059991, -3.01161172], [-73.97940009, 0.0, -73.97940009]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = amplitude_to_db(spectrogram, reference=2.0, db_range=80)
    -        expected = np.array([[0.0, -12.04119983, -9.03221164], [-80.0, -6.02059991, -80.0]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        output = amplitude_to_db(spectrogram, reference=2.0, min_value=1e-3, db_range=80)
    -        expected = np.array([[0.0, -12.04119983, -9.03221164], [-66.02059991, -6.02059991, -66.02059991]])
    -        self.assertTrue(np.allclose(output, expected))
    -
    -        with pytest.raises(ValueError):
    -            amplitude_to_db(spectrogram, reference=0.0)
    -        with pytest.raises(ValueError):
    -            amplitude_to_db(spectrogram, min_value=0.0)
    -        with pytest.raises(ValueError):
    -            amplitude_to_db(spectrogram, db_range=-80)
    diff --git a/tests/transformers/tests/utils/test_backbone_utils.py b/tests/transformers/tests/utils/test_backbone_utils.py
    deleted file mode 100644
    index 66b7087da2..0000000000
    --- a/tests/transformers/tests/utils/test_backbone_utils.py
    +++ /dev/null
    @@ -1,102 +0,0 @@
    -# Copyright 2023 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -from transformers.utils.backbone_utils import (
    -    BackboneMixin,
    -    get_aligned_output_features_output_indices,
    -    verify_out_features_out_indices,
    -)
    -
    -
    -class BackboneUtilsTester(unittest.TestCase):
    -    def test_get_aligned_output_features_output_indices(self):
    -        stage_names = ["a", "b", "c"]
    -
    -        # Defaults to last layer if both are None
    -        out_features, out_indices = get_aligned_output_features_output_indices(None, None, stage_names)
    -        self.assertEqual(out_features, ["c"])
    -        self.assertEqual(out_indices, [2])
    -
    -        # Out indices set to match out features
    -        out_features, out_indices = get_aligned_output_features_output_indices(["a", "c"], None, stage_names)
    -        self.assertEqual(out_features, ["a", "c"])
    -        self.assertEqual(out_indices, [0, 2])
    -
    -        # Out features set to match out indices
    -        out_features, out_indices = get_aligned_output_features_output_indices(None, [0, 2], stage_names)
    -        self.assertEqual(out_features, ["a", "c"])
    -        self.assertEqual(out_indices, [0, 2])
    -
    -        # Out features selected from negative indices
    -        out_features, out_indices = get_aligned_output_features_output_indices(None, [-3, -1], stage_names)
    -        self.assertEqual(out_features, ["a", "c"])
    -        self.assertEqual(out_indices, [-3, -1])
    -
    -    def test_verify_out_features_out_indices(self):
    -        # Stage names must be set
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(["a", "b"], (0, 1), None)
    -
    -        # Out features must be a list
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(("a", "b"), (0, 1), ["a", "b"])
    -
    -        # Out features must be a subset of stage names
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(["a", "b"], (0, 1), ["a"])
    -
    -        # Out indices must be a list or tuple
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(None, 0, ["a", "b"])
    -
    -        # Out indices must be a subset of stage names
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(None, (0, 1), ["a"])
    -
    -        # Out features and out indices must be the same length
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(["a", "b"], (0,), ["a", "b", "c"])
    -
    -        # Out features should match out indices
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(["a", "b"], (0, 2), ["a", "b", "c"])
    -
    -        # Out features and out indices should be in order
    -        with self.assertRaises(ValueError):
    -            verify_out_features_out_indices(["b", "a"], (0, 1), ["a", "b"])
    -
    -        # Check passes with valid inputs
    -        verify_out_features_out_indices(["a", "b", "d"], (0, 1, -1), ["a", "b", "c", "d"])
    -
    -    def test_backbone_mixin(self):
    -        backbone = BackboneMixin()
    -
    -        backbone.stage_names = ["a", "b", "c"]
    -        backbone._out_features = ["a", "c"]
    -        backbone._out_indices = [0, 2]
    -
    -        # Check that the output features and indices are set correctly
    -        self.assertEqual(backbone.out_features, ["a", "c"])
    -        self.assertEqual(backbone.out_indices, [0, 2])
    -
    -        # Check out features and indices are updated correctly
    -        backbone.out_features = ["a", "b"]
    -        self.assertEqual(backbone.out_features, ["a", "b"])
    -        self.assertEqual(backbone.out_indices, [0, 1])
    -
    -        backbone.out_indices = [-3, -1]
    -        self.assertEqual(backbone.out_features, ["a", "c"])
    -        self.assertEqual(backbone.out_indices, [-3, -1])
    diff --git a/tests/transformers/tests/utils/test_cli.py b/tests/transformers/tests/utils/test_cli.py
    deleted file mode 100644
    index fc7b8ebb5e..0000000000
    --- a/tests/transformers/tests/utils/test_cli.py
    +++ /dev/null
    @@ -1,91 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019-present, the HuggingFace Inc. team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import os
    -import shutil
    -import unittest
    -from unittest.mock import patch
    -
    -from transformers.testing_utils import CaptureStd, is_pt_tf_cross_test, require_torch
    -
    -
    -class CLITest(unittest.TestCase):
    -    @patch("sys.argv", ["fakeprogrampath", "env"])
    -    def test_cli_env(self):
    -        # test transformers-cli env
    -        import transformers.commands.transformers_cli
    -
    -        with CaptureStd() as cs:
    -            transformers.commands.transformers_cli.main()
    -        self.assertIn("Python version", cs.out)
    -        self.assertIn("Platform", cs.out)
    -        self.assertIn("Using distributed or parallel set-up in script?", cs.out)
    -
    -    @is_pt_tf_cross_test
    -    @patch(
    -        "sys.argv", ["fakeprogrampath", "pt-to-tf", "--model-name", "hf-internal-testing/tiny-random-gptj", "--no-pr"]
    -    )
    -    def test_cli_pt_to_tf(self):
    -        import transformers.commands.transformers_cli
    -
    -        shutil.rmtree("/tmp/hf-internal-testing/tiny-random-gptj", ignore_errors=True)  # cleans potential past runs
    -        transformers.commands.transformers_cli.main()
    -
    -        # The original repo has no TF weights -- if they exist, they were created by the CLI
    -        self.assertTrue(os.path.exists("/tmp/hf-internal-testing/tiny-random-gptj/tf_model.h5"))
    -
    -    @require_torch
    -    @patch("sys.argv", ["fakeprogrampath", "download", "hf-internal-testing/tiny-random-gptj", "--cache-dir", "/tmp"])
    -    def test_cli_download(self):
    -        import transformers.commands.transformers_cli
    -
    -        # remove any previously downloaded model to start clean
    -        shutil.rmtree("/tmp/models--hf-internal-testing--tiny-random-gptj", ignore_errors=True)
    -
    -        # run the command
    -        transformers.commands.transformers_cli.main()
    -
    -        # check if the model files are downloaded correctly on /tmp/models--hf-internal-testing--tiny-random-gptj
    -        self.assertTrue(os.path.exists("/tmp/models--hf-internal-testing--tiny-random-gptj/blobs"))
    -        self.assertTrue(os.path.exists("/tmp/models--hf-internal-testing--tiny-random-gptj/refs"))
    -        self.assertTrue(os.path.exists("/tmp/models--hf-internal-testing--tiny-random-gptj/snapshots"))
    -
    -    @require_torch
    -    @patch(
    -        "sys.argv",
    -        [
    -            "fakeprogrampath",
    -            "download",
    -            "hf-internal-testing/test_dynamic_model_with_tokenizer",
    -            "--trust-remote-code",
    -            "--cache-dir",
    -            "/tmp",
    -        ],
    -    )
    -    def test_cli_download_trust_remote(self):
    -        import transformers.commands.transformers_cli
    -
    -        # remove any previously downloaded model to start clean
    -        shutil.rmtree("/tmp/models--hf-internal-testing--test_dynamic_model_with_tokenizer", ignore_errors=True)
    -
    -        # run the command
    -        transformers.commands.transformers_cli.main()
    -
    -        # check if the model files are downloaded correctly on /tmp/models--hf-internal-testing--test_dynamic_model_with_tokenizer
    -        self.assertTrue(os.path.exists("/tmp/models--hf-internal-testing--test_dynamic_model_with_tokenizer/blobs"))
    -        self.assertTrue(os.path.exists("/tmp/models--hf-internal-testing--test_dynamic_model_with_tokenizer/refs"))
    -        self.assertTrue(
    -            os.path.exists("/tmp/models--hf-internal-testing--test_dynamic_model_with_tokenizer/snapshots")
    -        )
    diff --git a/tests/transformers/tests/utils/test_convert_slow_tokenizer.py b/tests/transformers/tests/utils/test_convert_slow_tokenizer.py
    deleted file mode 100644
    index edeb06c390..0000000000
    --- a/tests/transformers/tests/utils/test_convert_slow_tokenizer.py
    +++ /dev/null
    @@ -1,35 +0,0 @@
    -import unittest
    -import warnings
    -from dataclasses import dataclass
    -
    -from transformers.convert_slow_tokenizer import SpmConverter
    -from transformers.testing_utils import get_tests_dir
    -
    -
    -@dataclass
    -class FakeOriginalTokenizer:
    -    vocab_file: str
    -
    -
    -class ConvertSlowTokenizerTest(unittest.TestCase):
    -    def test_spm_converter_bytefallback_warning(self):
    -        spm_model_file_without_bytefallback = get_tests_dir("fixtures/test_sentencepiece.model")
    -        spm_model_file_with_bytefallback = get_tests_dir("fixtures/test_sentencepiece_with_bytefallback.model")
    -
    -        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_without_bytefallback)
    -
    -        with warnings.catch_warnings(record=True) as w:
    -            _ = SpmConverter(original_tokenizer_without_bytefallback)
    -        self.assertEqual(len(w), 0)
    -
    -        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
    -
    -        with warnings.catch_warnings(record=True) as w:
    -            _ = SpmConverter(original_tokenizer_with_bytefallback)
    -        self.assertEqual(len(w), 1)
    -
    -        self.assertIn(
    -            "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
    -            " which is not implemented in the fast tokenizers.",
    -            str(w[0].message),
    -        )
    diff --git a/tests/transformers/tests/utils/test_doc_samples.py b/tests/transformers/tests/utils/test_doc_samples.py
    deleted file mode 100644
    index 84c5a4d2bf..0000000000
    --- a/tests/transformers/tests/utils/test_doc_samples.py
    +++ /dev/null
    @@ -1,114 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019-present, the HuggingFace Inc. team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import doctest
    -import logging
    -import os
    -import unittest
    -from pathlib import Path
    -from typing import List, Union
    -
    -import transformers
    -from transformers.testing_utils import require_tf, require_torch, slow
    -
    -
    -logger = logging.getLogger()
    -
    -
    -@unittest.skip("Temporarily disable the doc tests.")
    -@require_torch
    -@require_tf
    -@slow
    -class TestCodeExamples(unittest.TestCase):
    -    def analyze_directory(
    -        self,
    -        directory: Path,
    -        identifier: Union[str, None] = None,
    -        ignore_files: Union[List[str], None] = None,
    -        n_identifier: Union[str, List[str], None] = None,
    -        only_modules: bool = True,
    -    ):
    -        """
    -        Runs through the specific directory, looking for the files identified with `identifier`. Executes
    -        the doctests in those files
    -
    -        Args:
    -            directory (`Path`): Directory containing the files
    -            identifier (`str`): Will parse files containing this
    -            ignore_files (`List[str]`): List of files to skip
    -            n_identifier (`str` or `List[str]`): Will not parse files containing this/these identifiers.
    -            only_modules (`bool`): Whether to only analyze modules
    -        """
    -        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
    -
    -        if identifier is not None:
    -            files = [file for file in files if identifier in file]
    -
    -        if n_identifier is not None:
    -            if isinstance(n_identifier, List):
    -                for n_ in n_identifier:
    -                    files = [file for file in files if n_ not in file]
    -            else:
    -                files = [file for file in files if n_identifier not in file]
    -
    -        ignore_files = ignore_files or []
    -        ignore_files.append("__init__.py")
    -        files = [file for file in files if file not in ignore_files]
    -
    -        for file in files:
    -            # Open all files
    -            print("Testing", file)
    -
    -            if only_modules:
    -                module_identifier = file.split(".")[0]
    -                try:
    -                    module_identifier = getattr(transformers, module_identifier)
    -                    suite = doctest.DocTestSuite(module_identifier)
    -                    result = unittest.TextTestRunner().run(suite)
    -                    self.assertIs(len(result.failures), 0)
    -                except AttributeError:
    -                    logger.info(f"{module_identifier} is not a module.")
    -            else:
    -                result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
    -                self.assertIs(result.failed, 0)
    -
    -    def test_modeling_examples(self):
    -        transformers_directory = Path("src/transformers")
    -        files = "modeling"
    -        ignore_files = [
    -            "modeling_ctrl.py",
    -            "modeling_tf_ctrl.py",
    -        ]
    -        self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
    -
    -    def test_tokenization_examples(self):
    -        transformers_directory = Path("src/transformers")
    -        files = "tokenization"
    -        self.analyze_directory(transformers_directory, identifier=files)
    -
    -    def test_configuration_examples(self):
    -        transformers_directory = Path("src/transformers")
    -        files = "configuration"
    -        self.analyze_directory(transformers_directory, identifier=files)
    -
    -    def test_remaining_examples(self):
    -        transformers_directory = Path("src/transformers")
    -        n_identifiers = ["configuration", "modeling", "tokenization"]
    -        self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
    -
    -    def test_doc_sources(self):
    -        doc_source_directory = Path("docs/source")
    -        ignore_files = ["favicon.ico"]
    -        self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
    diff --git a/tests/transformers/tests/utils/test_dynamic_module_utils.py b/tests/transformers/tests/utils/test_dynamic_module_utils.py
    deleted file mode 100644
    index c9b2694937..0000000000
    --- a/tests/transformers/tests/utils/test_dynamic_module_utils.py
    +++ /dev/null
    @@ -1,128 +0,0 @@
    -# Copyright 2023 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import os
    -
    -import pytest
    -from transformers.dynamic_module_utils import get_imports
    -
    -
    -TOP_LEVEL_IMPORT = """
    -import os
    -"""
    -
    -IMPORT_IN_FUNCTION = """
    -def foo():
    -    import os
    -    return False
    -"""
    -
    -DEEPLY_NESTED_IMPORT = """
    -def foo():
    -    def bar():
    -        if True:
    -            import os
    -        return False
    -    return bar()
    -"""
    -
    -TOP_LEVEL_TRY_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -except ImportError:
    -    raise ValueError()
    -"""
    -
    -TRY_IMPORT_IN_FUNCTION = """
    -import os
    -
    -def foo():
    -    try:
    -        import bar
    -    except ImportError:
    -        raise ValueError()
    -"""
    -
    -MULTIPLE_EXCEPTS_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -except (ImportError, AttributeError):
    -    raise ValueError()
    -"""
    -
    -EXCEPT_AS_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -except ImportError as e:
    -    raise ValueError()
    -"""
    -
    -GENERIC_EXCEPT_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -except:
    -    raise ValueError()
    -"""
    -
    -MULTILINE_TRY_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -    import baz
    -except ImportError:
    -    raise ValueError()
    -"""
    -
    -MULTILINE_BOTH_IMPORT = """
    -import os
    -
    -try:
    -    import bar
    -    import baz
    -except ImportError:
    -    x = 1
    -    raise ValueError()
    -"""
    -
    -CASES = [
    -    TOP_LEVEL_IMPORT,
    -    IMPORT_IN_FUNCTION,
    -    DEEPLY_NESTED_IMPORT,
    -    TOP_LEVEL_TRY_IMPORT,
    -    GENERIC_EXCEPT_IMPORT,
    -    MULTILINE_TRY_IMPORT,
    -    MULTILINE_BOTH_IMPORT,
    -    MULTIPLE_EXCEPTS_IMPORT,
    -    EXCEPT_AS_IMPORT,
    -    TRY_IMPORT_IN_FUNCTION,
    -]
    -
    -
    -@pytest.mark.parametrize("case", CASES)
    -def test_import_parsing(tmp_path, case):
    -    tmp_file_path = os.path.join(tmp_path, "test_file.py")
    -    with open(tmp_file_path, "w") as _tmp_file:
    -        _tmp_file.write(case)
    -
    -    parsed_imports = get_imports(tmp_file_path)
    -    assert parsed_imports == ["os"]
    diff --git a/tests/transformers/tests/utils/test_file_utils.py b/tests/transformers/tests/utils/test_file_utils.py
    deleted file mode 100644
    index 1cbde0fb18..0000000000
    --- a/tests/transformers/tests/utils/test_file_utils.py
    +++ /dev/null
    @@ -1,133 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import contextlib
    -import importlib
    -import io
    -import unittest
    -
    -import transformers
    -
    -# Try to import everything from transformers to ensure every object can be loaded.
    -from transformers import *  # noqa F406
    -from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_flax, require_tf, require_torch
    -from transformers.utils import ContextManagers, find_labels, is_flax_available, is_tf_available, is_torch_available
    -
    -
    -if is_torch_available():
    -    from transformers import BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification
    -
    -if is_tf_available():
    -    from transformers import TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification
    -
    -if is_flax_available():
    -    from transformers import FlaxBertForPreTraining, FlaxBertForQuestionAnswering, FlaxBertForSequenceClassification
    -
    -
    -MODEL_ID = DUMMY_UNKNOWN_IDENTIFIER
    -# An actual model hosted on huggingface.co
    -
    -REVISION_ID_DEFAULT = "main"
    -# Default branch name
    -REVISION_ID_ONE_SPECIFIC_COMMIT = "f2c752cfc5c0ab6f4bdec59acea69eefbee381c2"
    -# One particular commit (not the top of `main`)
    -REVISION_ID_INVALID = "aaaaaaa"
    -# This commit does not exist, so we should 404.
    -
    -PINNED_SHA1 = "d9e9f15bc825e4b2c9249e9578f884bbcb5e3684"
    -# Sha-1 of config.json on the top of `main`, for checking purposes
    -PINNED_SHA256 = "4b243c475af8d0a7754e87d7d096c92e5199ec2fe168a2ee7998e3b8e9bcb1d3"
    -# Sha-256 of pytorch_model.bin on the top of `main`, for checking purposes
    -
    -
    -# Dummy contexts to test `ContextManagers`
    -@contextlib.contextmanager
    -def context_en():
    -    print("Welcome!")
    -    yield
    -    print("Bye!")
    -
    -
    -@contextlib.contextmanager
    -def context_fr():
    -    print("Bonjour!")
    -    yield
    -    print("Au revoir!")
    -
    -
    -class TestImportMechanisms(unittest.TestCase):
    -    def test_module_spec_available(self):
    -        # If the spec is missing, importlib would not be able to import the module dynamically.
    -        assert transformers.__spec__ is not None
    -        assert importlib.util.find_spec("transformers") is not None
    -
    -
    -class GenericUtilTests(unittest.TestCase):
    -    @unittest.mock.patch("sys.stdout", new_callable=io.StringIO)
    -    def test_context_managers_no_context(self, mock_stdout):
    -        with ContextManagers([]):
    -            print("Transformers are awesome!")
    -        # The print statement adds a new line at the end of the output
    -        self.assertEqual(mock_stdout.getvalue(), "Transformers are awesome!\n")
    -
    -    @unittest.mock.patch("sys.stdout", new_callable=io.StringIO)
    -    def test_context_managers_one_context(self, mock_stdout):
    -        with ContextManagers([context_en()]):
    -            print("Transformers are awesome!")
    -        # The output should be wrapped with an English welcome and goodbye
    -        self.assertEqual(mock_stdout.getvalue(), "Welcome!\nTransformers are awesome!\nBye!\n")
    -
    -    @unittest.mock.patch("sys.stdout", new_callable=io.StringIO)
    -    def test_context_managers_two_context(self, mock_stdout):
    -        with ContextManagers([context_fr(), context_en()]):
    -            print("Transformers are awesome!")
    -        # The output should be wrapped with an English and French welcome and goodbye
    -        self.assertEqual(mock_stdout.getvalue(), "Bonjour!\nWelcome!\nTransformers are awesome!\nBye!\nAu revoir!\n")
    -
    -    @require_torch
    -    def test_find_labels_pt(self):
    -        self.assertEqual(find_labels(BertForSequenceClassification), ["labels"])
    -        self.assertEqual(find_labels(BertForPreTraining), ["labels", "next_sentence_label"])
    -        self.assertEqual(find_labels(BertForQuestionAnswering), ["start_positions", "end_positions"])
    -
    -        # find_labels works regardless of the class name (it detects the framework through inheritance)
    -        class DummyModel(BertForSequenceClassification):
    -            pass
    -
    -        self.assertEqual(find_labels(DummyModel), ["labels"])
    -
    -    @require_tf
    -    def test_find_labels_tf(self):
    -        self.assertEqual(find_labels(TFBertForSequenceClassification), ["labels"])
    -        self.assertEqual(find_labels(TFBertForPreTraining), ["labels", "next_sentence_label"])
    -        self.assertEqual(find_labels(TFBertForQuestionAnswering), ["start_positions", "end_positions"])
    -
    -        # find_labels works regardless of the class name (it detects the framework through inheritance)
    -        class DummyModel(TFBertForSequenceClassification):
    -            pass
    -
    -        self.assertEqual(find_labels(DummyModel), ["labels"])
    -
    -    @require_flax
    -    def test_find_labels_flax(self):
    -        # Flax models don't have labels
    -        self.assertEqual(find_labels(FlaxBertForSequenceClassification), [])
    -        self.assertEqual(find_labels(FlaxBertForPreTraining), [])
    -        self.assertEqual(find_labels(FlaxBertForQuestionAnswering), [])
    -
    -        # find_labels works regardless of the class name (it detects the framework through inheritance)
    -        class DummyModel(FlaxBertForSequenceClassification):
    -            pass
    -
    -        self.assertEqual(find_labels(DummyModel), [])
    diff --git a/tests/transformers/tests/utils/test_generic.py b/tests/transformers/tests/utils/test_generic.py
    deleted file mode 100644
    index e9001289d4..0000000000
    --- a/tests/transformers/tests/utils/test_generic.py
    +++ /dev/null
    @@ -1,199 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019-present, the HuggingFace Inc. team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -import numpy as np
    -from transformers.testing_utils import require_flax, require_tf, require_torch
    -from transformers.utils import (
    -    expand_dims,
    -    flatten_dict,
    -    is_flax_available,
    -    is_tf_available,
    -    is_torch_available,
    -    reshape,
    -    squeeze,
    -    transpose,
    -)
    -
    -
    -if is_flax_available():
    -    import jax.numpy as jnp
    -
    -if is_tf_available():
    -    import tensorflow as tf
    -
    -if is_torch_available():
    -    import torch
    -
    -
    -class GenericTester(unittest.TestCase):
    -    def test_flatten_dict(self):
    -        input_dict = {
    -            "task_specific_params": {
    -                "summarization": {"length_penalty": 1.0, "max_length": 128, "min_length": 12, "num_beams": 4},
    -                "summarization_cnn": {"length_penalty": 2.0, "max_length": 142, "min_length": 56, "num_beams": 4},
    -                "summarization_xsum": {"length_penalty": 1.0, "max_length": 62, "min_length": 11, "num_beams": 6},
    -            }
    -        }
    -        expected_dict = {
    -            "task_specific_params.summarization.length_penalty": 1.0,
    -            "task_specific_params.summarization.max_length": 128,
    -            "task_specific_params.summarization.min_length": 12,
    -            "task_specific_params.summarization.num_beams": 4,
    -            "task_specific_params.summarization_cnn.length_penalty": 2.0,
    -            "task_specific_params.summarization_cnn.max_length": 142,
    -            "task_specific_params.summarization_cnn.min_length": 56,
    -            "task_specific_params.summarization_cnn.num_beams": 4,
    -            "task_specific_params.summarization_xsum.length_penalty": 1.0,
    -            "task_specific_params.summarization_xsum.max_length": 62,
    -            "task_specific_params.summarization_xsum.min_length": 11,
    -            "task_specific_params.summarization_xsum.num_beams": 6,
    -        }
    -
    -        self.assertEqual(flatten_dict(input_dict), expected_dict)
    -
    -    def test_transpose_numpy(self):
    -        x = np.random.randn(3, 4)
    -        self.assertTrue(np.allclose(transpose(x), x.transpose()))
    -
    -        x = np.random.randn(3, 4, 5)
    -        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), x.transpose((1, 2, 0))))
    -
    -    @require_torch
    -    def test_transpose_torch(self):
    -        x = np.random.randn(3, 4)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(transpose(x), transpose(t).numpy()))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))
    -
    -    @require_tf
    -    def test_transpose_tf(self):
    -        x = np.random.randn(3, 4)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(transpose(x), transpose(t).numpy()))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), transpose(t, axes=(1, 2, 0)).numpy()))
    -
    -    @require_flax
    -    def test_transpose_flax(self):
    -        x = np.random.randn(3, 4)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(transpose(x), np.asarray(transpose(t))))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(transpose(x, axes=(1, 2, 0)), np.asarray(transpose(t, axes=(1, 2, 0)))))
    -
    -    def test_reshape_numpy(self):
    -        x = np.random.randn(3, 4)
    -        self.assertTrue(np.allclose(reshape(x, (4, 3)), np.reshape(x, (4, 3))))
    -
    -        x = np.random.randn(3, 4, 5)
    -        self.assertTrue(np.allclose(reshape(x, (12, 5)), np.reshape(x, (12, 5))))
    -
    -    @require_torch
    -    def test_reshape_torch(self):
    -        x = np.random.randn(3, 4)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(reshape(x, (4, 3)), reshape(t, (4, 3)).numpy()))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))
    -
    -    @require_tf
    -    def test_reshape_tf(self):
    -        x = np.random.randn(3, 4)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(reshape(x, (4, 3)), reshape(t, (4, 3)).numpy()))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(reshape(x, (12, 5)), reshape(t, (12, 5)).numpy()))
    -
    -    @require_flax
    -    def test_reshape_flax(self):
    -        x = np.random.randn(3, 4)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(reshape(x, (4, 3)), np.asarray(reshape(t, (4, 3)))))
    -
    -        x = np.random.randn(3, 4, 5)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(reshape(x, (12, 5)), np.asarray(reshape(t, (12, 5)))))
    -
    -    def test_squeeze_numpy(self):
    -        x = np.random.randn(1, 3, 4)
    -        self.assertTrue(np.allclose(squeeze(x), np.squeeze(x)))
    -
    -        x = np.random.randn(1, 4, 1, 5)
    -        self.assertTrue(np.allclose(squeeze(x, axis=2), np.squeeze(x, axis=2)))
    -
    -    @require_torch
    -    def test_squeeze_torch(self):
    -        x = np.random.randn(1, 3, 4)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(squeeze(x), squeeze(t).numpy()))
    -
    -        x = np.random.randn(1, 4, 1, 5)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))
    -
    -    @require_tf
    -    def test_squeeze_tf(self):
    -        x = np.random.randn(1, 3, 4)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(squeeze(x), squeeze(t).numpy()))
    -
    -        x = np.random.randn(1, 4, 1, 5)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(squeeze(x, axis=2), squeeze(t, axis=2).numpy()))
    -
    -    @require_flax
    -    def test_squeeze_flax(self):
    -        x = np.random.randn(1, 3, 4)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(squeeze(x), np.asarray(squeeze(t))))
    -
    -        x = np.random.randn(1, 4, 1, 5)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(squeeze(x, axis=2), np.asarray(squeeze(t, axis=2))))
    -
    -    def test_expand_dims_numpy(self):
    -        x = np.random.randn(3, 4)
    -        self.assertTrue(np.allclose(expand_dims(x, axis=1), np.expand_dims(x, axis=1)))
    -
    -    @require_torch
    -    def test_expand_dims_torch(self):
    -        x = np.random.randn(3, 4)
    -        t = torch.tensor(x)
    -        self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))
    -
    -    @require_tf
    -    def test_expand_dims_tf(self):
    -        x = np.random.randn(3, 4)
    -        t = tf.constant(x)
    -        self.assertTrue(np.allclose(expand_dims(x, axis=1), expand_dims(t, axis=1).numpy()))
    -
    -    @require_flax
    -    def test_expand_dims_flax(self):
    -        x = np.random.randn(3, 4)
    -        t = jnp.array(x)
    -        self.assertTrue(np.allclose(expand_dims(x, axis=1), np.asarray(expand_dims(t, axis=1))))
    diff --git a/tests/transformers/tests/utils/test_hf_argparser.py b/tests/transformers/tests/utils/test_hf_argparser.py
    deleted file mode 100644
    index 4d0997861d..0000000000
    --- a/tests/transformers/tests/utils/test_hf_argparser.py
    +++ /dev/null
    @@ -1,406 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import argparse
    -import json
    -import os
    -import sys
    -import tempfile
    -import unittest
    -from argparse import Namespace
    -from dataclasses import dataclass, field
    -from enum import Enum
    -from pathlib import Path
    -from typing import List, Literal, Optional
    -
    -import yaml
    -from transformers import HfArgumentParser, TrainingArguments
    -from transformers.hf_argparser import make_choice_type_function, string_to_bool
    -
    -
    -# Since Python 3.10, we can use the builtin `|` operator for Union types
    -# See PEP 604: https://peps.python.org/pep-0604
    -is_python_no_less_than_3_10 = sys.version_info >= (3, 10)
    -
    -
    -def list_field(default=None, metadata=None):
    -    return field(default_factory=lambda: default, metadata=metadata)
    -
    -
    -@dataclass
    -class BasicExample:
    -    foo: int
    -    bar: float
    -    baz: str
    -    flag: bool
    -
    -
    -@dataclass
    -class WithDefaultExample:
    -    foo: int = 42
    -    baz: str = field(default="toto", metadata={"help": "help message"})
    -
    -
    -@dataclass
    -class WithDefaultBoolExample:
    -    foo: bool = False
    -    baz: bool = True
    -    opt: Optional[bool] = None
    -
    -
    -class BasicEnum(Enum):
    -    titi = "titi"
    -    toto = "toto"
    -
    -
    -class MixedTypeEnum(Enum):
    -    titi = "titi"
    -    toto = "toto"
    -    fourtytwo = 42
    -
    -
    -@dataclass
    -class EnumExample:
    -    foo: BasicEnum = "toto"
    -
    -    def __post_init__(self):
    -        self.foo = BasicEnum(self.foo)
    -
    -
    -@dataclass
    -class MixedTypeEnumExample:
    -    foo: MixedTypeEnum = "toto"
    -
    -    def __post_init__(self):
    -        self.foo = MixedTypeEnum(self.foo)
    -
    -
    -@dataclass
    -class OptionalExample:
    -    foo: Optional[int] = None
    -    bar: Optional[float] = field(default=None, metadata={"help": "help message"})
    -    baz: Optional[str] = None
    -    ces: Optional[List[str]] = list_field(default=[])
    -    des: Optional[List[int]] = list_field(default=[])
    -
    -
    -@dataclass
    -class ListExample:
    -    foo_int: List[int] = list_field(default=[])
    -    bar_int: List[int] = list_field(default=[1, 2, 3])
    -    foo_str: List[str] = list_field(default=["Hallo", "Bonjour", "Hello"])
    -    foo_float: List[float] = list_field(default=[0.1, 0.2, 0.3])
    -
    -
    -@dataclass
    -class RequiredExample:
    -    required_list: List[int] = field()
    -    required_str: str = field()
    -    required_enum: BasicEnum = field()
    -
    -    def __post_init__(self):
    -        self.required_enum = BasicEnum(self.required_enum)
    -
    -
    -@dataclass
    -class StringLiteralAnnotationExample:
    -    foo: int
    -    required_enum: "BasicEnum" = field()
    -    opt: "Optional[bool]" = None
    -    baz: "str" = field(default="toto", metadata={"help": "help message"})
    -    foo_str: "List[str]" = list_field(default=["Hallo", "Bonjour", "Hello"])
    -
    -
    -if is_python_no_less_than_3_10:
    -
    -    @dataclass
    -    class WithDefaultBoolExamplePep604:
    -        foo: bool = False
    -        baz: bool = True
    -        opt: bool | None = None
    -
    -    @dataclass
    -    class OptionalExamplePep604:
    -        foo: int | None = None
    -        bar: float | None = field(default=None, metadata={"help": "help message"})
    -        baz: str | None = None
    -        ces: list[str] | None = list_field(default=[])
    -        des: list[int] | None = list_field(default=[])
    -
    -
    -class HfArgumentParserTest(unittest.TestCase):
    -    def argparsersEqual(self, a: argparse.ArgumentParser, b: argparse.ArgumentParser):
    -        """
    -        Small helper to check pseudo-equality of parsed arguments on `ArgumentParser` instances.
    -        """
    -        self.assertEqual(len(a._actions), len(b._actions))
    -        for x, y in zip(a._actions, b._actions):
    -            xx = {k: v for k, v in vars(x).items() if k != "container"}
    -            yy = {k: v for k, v in vars(y).items() if k != "container"}
    -
    -            # Choices with mixed type have custom function as "type"
    -            # So we need to compare results directly for equality
    -            if xx.get("choices", None) and yy.get("choices", None):
    -                for expected_choice in yy["choices"] + xx["choices"]:
    -                    self.assertEqual(xx["type"](expected_choice), yy["type"](expected_choice))
    -                del xx["type"], yy["type"]
    -
    -            self.assertEqual(xx, yy)
    -
    -    def test_basic(self):
    -        parser = HfArgumentParser(BasicExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo", type=int, required=True)
    -        expected.add_argument("--bar", type=float, required=True)
    -        expected.add_argument("--baz", type=str, required=True)
    -        expected.add_argument("--flag", type=string_to_bool, default=False, const=True, nargs="?")
    -        self.argparsersEqual(parser, expected)
    -
    -        args = ["--foo", "1", "--baz", "quux", "--bar", "0.5"]
    -        (example,) = parser.parse_args_into_dataclasses(args, look_for_args_file=False)
    -        self.assertFalse(example.flag)
    -
    -    def test_with_default(self):
    -        parser = HfArgumentParser(WithDefaultExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo", default=42, type=int)
    -        expected.add_argument("--baz", default="toto", type=str, help="help message")
    -        self.argparsersEqual(parser, expected)
    -
    -    def test_with_default_bool(self):
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?")
    -        expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?")
    -        # A boolean no_* argument always has to come after its "default: True" regular counterpart
    -        # and its default must be set to False
    -        expected.add_argument("--no_baz", action="store_false", default=False, dest="baz")
    -        expected.add_argument("--opt", type=string_to_bool, default=None)
    -
    -        dataclass_types = [WithDefaultBoolExample]
    -        if is_python_no_less_than_3_10:
    -            dataclass_types.append(WithDefaultBoolExamplePep604)
    -
    -        for dataclass_type in dataclass_types:
    -            parser = HfArgumentParser(dataclass_type)
    -            self.argparsersEqual(parser, expected)
    -
    -            args = parser.parse_args([])
    -            self.assertEqual(args, Namespace(foo=False, baz=True, opt=None))
    -
    -            args = parser.parse_args(["--foo", "--no_baz"])
    -            self.assertEqual(args, Namespace(foo=True, baz=False, opt=None))
    -
    -            args = parser.parse_args(["--foo", "--baz"])
    -            self.assertEqual(args, Namespace(foo=True, baz=True, opt=None))
    -
    -            args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"])
    -            self.assertEqual(args, Namespace(foo=True, baz=True, opt=True))
    -
    -            args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"])
    -            self.assertEqual(args, Namespace(foo=False, baz=False, opt=False))
    -
    -    def test_with_enum(self):
    -        parser = HfArgumentParser(MixedTypeEnumExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument(
    -            "--foo",
    -            default="toto",
    -            choices=["titi", "toto", 42],
    -            type=make_choice_type_function(["titi", "toto", 42]),
    -        )
    -        self.argparsersEqual(parser, expected)
    -
    -        args = parser.parse_args([])
    -        self.assertEqual(args.foo, "toto")
    -        enum_ex = parser.parse_args_into_dataclasses([])[0]
    -        self.assertEqual(enum_ex.foo, MixedTypeEnum.toto)
    -
    -        args = parser.parse_args(["--foo", "titi"])
    -        self.assertEqual(args.foo, "titi")
    -        enum_ex = parser.parse_args_into_dataclasses(["--foo", "titi"])[0]
    -        self.assertEqual(enum_ex.foo, MixedTypeEnum.titi)
    -
    -        args = parser.parse_args(["--foo", "42"])
    -        self.assertEqual(args.foo, 42)
    -        enum_ex = parser.parse_args_into_dataclasses(["--foo", "42"])[0]
    -        self.assertEqual(enum_ex.foo, MixedTypeEnum.fourtytwo)
    -
    -    def test_with_literal(self):
    -        @dataclass
    -        class LiteralExample:
    -            foo: Literal["titi", "toto", 42] = "toto"
    -
    -        parser = HfArgumentParser(LiteralExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument(
    -            "--foo",
    -            default="toto",
    -            choices=("titi", "toto", 42),
    -            type=make_choice_type_function(["titi", "toto", 42]),
    -        )
    -        self.argparsersEqual(parser, expected)
    -
    -        args = parser.parse_args([])
    -        self.assertEqual(args.foo, "toto")
    -
    -        args = parser.parse_args(["--foo", "titi"])
    -        self.assertEqual(args.foo, "titi")
    -
    -        args = parser.parse_args(["--foo", "42"])
    -        self.assertEqual(args.foo, 42)
    -
    -    def test_with_list(self):
    -        parser = HfArgumentParser(ListExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo_int", nargs="+", default=[], type=int)
    -        expected.add_argument("--bar_int", nargs="+", default=[1, 2, 3], type=int)
    -        expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str)
    -        expected.add_argument("--foo_float", nargs="+", default=[0.1, 0.2, 0.3], type=float)
    -
    -        self.argparsersEqual(parser, expected)
    -
    -        args = parser.parse_args([])
    -        self.assertEqual(
    -            args,
    -            Namespace(foo_int=[], bar_int=[1, 2, 3], foo_str=["Hallo", "Bonjour", "Hello"], foo_float=[0.1, 0.2, 0.3]),
    -        )
    -
    -        args = parser.parse_args("--foo_int 1 --bar_int 2 3 --foo_str a b c --foo_float 0.1 0.7".split())
    -        self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7]))
    -
    -    def test_with_optional(self):
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo", default=None, type=int)
    -        expected.add_argument("--bar", default=None, type=float, help="help message")
    -        expected.add_argument("--baz", default=None, type=str)
    -        expected.add_argument("--ces", nargs="+", default=[], type=str)
    -        expected.add_argument("--des", nargs="+", default=[], type=int)
    -
    -        dataclass_types = [OptionalExample]
    -        if is_python_no_less_than_3_10:
    -            dataclass_types.append(OptionalExamplePep604)
    -
    -        for dataclass_type in dataclass_types:
    -            parser = HfArgumentParser(dataclass_type)
    -
    -            self.argparsersEqual(parser, expected)
    -
    -            args = parser.parse_args([])
    -            self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[]))
    -
    -            args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split())
    -            self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3]))
    -
    -    def test_with_required(self):
    -        parser = HfArgumentParser(RequiredExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--required_list", nargs="+", type=int, required=True)
    -        expected.add_argument("--required_str", type=str, required=True)
    -        expected.add_argument(
    -            "--required_enum",
    -            type=make_choice_type_function(["titi", "toto"]),
    -            choices=["titi", "toto"],
    -            required=True,
    -        )
    -        self.argparsersEqual(parser, expected)
    -
    -    def test_with_string_literal_annotation(self):
    -        parser = HfArgumentParser(StringLiteralAnnotationExample)
    -
    -        expected = argparse.ArgumentParser()
    -        expected.add_argument("--foo", type=int, required=True)
    -        expected.add_argument(
    -            "--required_enum",
    -            type=make_choice_type_function(["titi", "toto"]),
    -            choices=["titi", "toto"],
    -            required=True,
    -        )
    -        expected.add_argument("--opt", type=string_to_bool, default=None)
    -        expected.add_argument("--baz", default="toto", type=str, help="help message")
    -        expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str)
    -        self.argparsersEqual(parser, expected)
    -
    -    def test_parse_dict(self):
    -        parser = HfArgumentParser(BasicExample)
    -
    -        args_dict = {
    -            "foo": 12,
    -            "bar": 3.14,
    -            "baz": "42",
    -            "flag": True,
    -        }
    -
    -        parsed_args = parser.parse_dict(args_dict)[0]
    -        args = BasicExample(**args_dict)
    -        self.assertEqual(parsed_args, args)
    -
    -    def test_parse_dict_extra_key(self):
    -        parser = HfArgumentParser(BasicExample)
    -
    -        args_dict = {
    -            "foo": 12,
    -            "bar": 3.14,
    -            "baz": "42",
    -            "flag": True,
    -            "extra": 42,
    -        }
    -
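    -        # "extra" is not a field of BasicExample, so parse_dict must raise when allow_extra_keys=False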
    -        self.assertRaises(ValueError, parser.parse_dict, args_dict, allow_extra_keys=False)
    -
    -    def test_parse_json(self):
    -        parser = HfArgumentParser(BasicExample)
    -
    -        args_dict_for_json = {
    -            "foo": 12,
    -            "bar": 3.14,
    -            "baz": "42",
    -            "flag": True,
    -        }
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            temp_local_path = os.path.join(tmp_dir, "temp_json")
    -            os.mkdir(temp_local_path)
    -            with open(temp_local_path + ".json", "w+") as f:
    -                json.dump(args_dict_for_json, f)
    -            parsed_args = parser.parse_json_file(Path(temp_local_path + ".json"))[0]
    -
    -        args = BasicExample(**args_dict_for_json)
    -        self.assertEqual(parsed_args, args)
    -
    -    def test_parse_yaml(self):
    -        parser = HfArgumentParser(BasicExample)
    -
    -        args_dict_for_yaml = {
    -            "foo": 12,
    -            "bar": 3.14,
    -            "baz": "42",
    -            "flag": True,
    -        }
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            temp_local_path = os.path.join(tmp_dir, "temp_yaml")
    -            os.mkdir(temp_local_path)
    -            with open(temp_local_path + ".yaml", "w+") as f:
    -                yaml.dump(args_dict_for_yaml, f)
    -            parsed_args = parser.parse_yaml_file(Path(temp_local_path + ".yaml"))[0]
    -        args = BasicExample(**args_dict_for_yaml)
    -        self.assertEqual(parsed_args, args)
    -
    -    def test_integration_training_args(self):
    -        parser = HfArgumentParser(TrainingArguments)
    -        self.assertIsNotNone(parser)
    diff --git a/tests/transformers/tests/utils/test_hub_utils.py b/tests/transformers/tests/utils/test_hub_utils.py
    deleted file mode 100644
    index 836f4f5fa7..0000000000
    --- a/tests/transformers/tests/utils/test_hub_utils.py
    +++ /dev/null
    @@ -1,140 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -import json
    -import os
    -import tempfile
    -import unittest
    -import unittest.mock as mock
    -from pathlib import Path
    -
    -from requests.exceptions import HTTPError
    -from transformers.utils import (
    -    CONFIG_NAME,
    -    FLAX_WEIGHTS_NAME,
    -    TF2_WEIGHTS_NAME,
    -    TRANSFORMERS_CACHE,
    -    WEIGHTS_NAME,
    -    cached_file,
    -    get_file_from_repo,
    -    has_file,
    -)
    -
    -
    -RANDOM_BERT = "hf-internal-testing/tiny-random-bert"
    -CACHE_DIR = os.path.join(TRANSFORMERS_CACHE, "models--hf-internal-testing--tiny-random-bert")
    -FULL_COMMIT_HASH = "9b8c223d42b2188cb49d29af482996f9d0f3e5a6"
    -
    -GATED_REPO = "hf-internal-testing/dummy-gated-model"
    -README_FILE = "README.md"
    -
    -
    -class GetFromCacheTests(unittest.TestCase):
    -    def test_cached_file(self):
    -        archive_file = cached_file(RANDOM_BERT, CONFIG_NAME)
    -        # Should have downloaded the file in here
    -        self.assertTrue(os.path.isdir(CACHE_DIR))
    -        # Cache should contain at least those three subfolders:
    -        for subfolder in ["blobs", "refs", "snapshots"]:
    -            self.assertTrue(os.path.isdir(os.path.join(CACHE_DIR, subfolder)))
    -        with open(os.path.join(CACHE_DIR, "refs", "main")) as f:
    -            main_commit = f.read()
    -        self.assertEqual(archive_file, os.path.join(CACHE_DIR, "snapshots", main_commit, CONFIG_NAME))
    -        self.assertTrue(os.path.isfile(archive_file))
    -
    -        # File is cached at the same place the second time.
    -        new_archive_file = cached_file(RANDOM_BERT, CONFIG_NAME)
    -        self.assertEqual(archive_file, new_archive_file)
    -
    -        # Using a specific revision to test the full commit hash.
    -        archive_file = cached_file(RANDOM_BERT, CONFIG_NAME, revision="9b8c223")
    -        self.assertEqual(archive_file, os.path.join(CACHE_DIR, "snapshots", FULL_COMMIT_HASH, CONFIG_NAME))
    -
    -    def test_cached_file_errors(self):
    -        with self.assertRaisesRegex(EnvironmentError, "is not a valid model identifier"):
    -            _ = cached_file("tiny-random-bert", CONFIG_NAME)
    -
    -        with self.assertRaisesRegex(EnvironmentError, "is not a valid git identifier"):
    -            _ = cached_file(RANDOM_BERT, CONFIG_NAME, revision="aaaa")
    -
    -        with self.assertRaisesRegex(EnvironmentError, "does not appear to have a file named"):
    -            _ = cached_file(RANDOM_BERT, "conf")
    -
    -    def test_non_existence_is_cached(self):
    -        with self.assertRaisesRegex(EnvironmentError, "does not appear to have a file named"):
    -            _ = cached_file(RANDOM_BERT, "conf")
    -
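    -        # The miss is recorded in the cache under `.no_exist/<commit>/`, so later lookups can be resolved without hitting the Hub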
    -        with open(os.path.join(CACHE_DIR, "refs", "main")) as f:
    -            main_commit = f.read()
    -        self.assertTrue(os.path.isfile(os.path.join(CACHE_DIR, ".no_exist", main_commit, "conf")))
    -
    -        path = cached_file(RANDOM_BERT, "conf", _raise_exceptions_for_missing_entries=False)
    -        self.assertIsNone(path)
    -
    -        path = cached_file(RANDOM_BERT, "conf", local_files_only=True, _raise_exceptions_for_missing_entries=False)
    -        self.assertIsNone(path)
    -
    -        response_mock = mock.Mock()
    -        response_mock.status_code = 500
    -        response_mock.headers = {}
    -        response_mock.raise_for_status.side_effect = HTTPError
    -        response_mock.json.return_value = {}
    -
    -        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
    -        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
    -            path = cached_file(RANDOM_BERT, "conf", _raise_exceptions_for_connection_errors=False)
    -            self.assertIsNone(path)
    -            # This checks that the mocked head request was indeed called
    -            mock_head.assert_called()
    -
    -    def test_has_file(self):
    -        self.assertTrue(has_file("hf-internal-testing/tiny-bert-pt-only", WEIGHTS_NAME))
    -        self.assertFalse(has_file("hf-internal-testing/tiny-bert-pt-only", TF2_WEIGHTS_NAME))
    -        self.assertFalse(has_file("hf-internal-testing/tiny-bert-pt-only", FLAX_WEIGHTS_NAME))
    -
    -    def test_get_file_from_repo_distant(self):
    -        # `get_file_from_repo` returns None if the file does not exist
    -        self.assertIsNone(get_file_from_repo("bert-base-cased", "ahah.txt"))
    -
    -        # The function raises if the repository does not exist.
    -        with self.assertRaisesRegex(EnvironmentError, "is not a valid model identifier"):
    -            get_file_from_repo("bert-base-case", CONFIG_NAME)
    -
    -        # The function raises if the revision does not exist.
    -        with self.assertRaisesRegex(EnvironmentError, "is not a valid git identifier"):
    -            get_file_from_repo("bert-base-cased", CONFIG_NAME, revision="ahaha")
    -
    -        resolved_file = get_file_from_repo("bert-base-cased", CONFIG_NAME)
    -        # The name is the cached name which is not very easy to test, so instead we load the content.
    -        config = json.loads(open(resolved_file, "r").read())
    -        self.assertEqual(config["hidden_size"], 768)
    -
    -    def test_get_file_from_repo_local(self):
    -        with tempfile.TemporaryDirectory() as tmp_dir:
    -            filename = Path(tmp_dir) / "a.txt"
    -            filename.touch()
    -            self.assertEqual(get_file_from_repo(tmp_dir, "a.txt"), str(filename))
    -
    -            self.assertIsNone(get_file_from_repo(tmp_dir, "b.txt"))
    -
    -    @unittest.skip("Test is broken, fix me Wauplain!")
    -    def test_get_file_gated_repo(self):
    -        """Test download file from a gated repo fails with correct message when not authenticated."""
    -        with self.assertRaisesRegex(EnvironmentError, "You are trying to access a gated repo."):
    -            cached_file(GATED_REPO, README_FILE, use_auth_token=False)
    -
    -    @unittest.skip("Test is broken, fix me Wauplain!")
    -    def test_has_file_gated_repo(self):
    -        """Test check file existence from a gated repo fails with correct message when not authenticated."""
    -        with self.assertRaisesRegex(EnvironmentError, "is a gated repository"):
    -            has_file(GATED_REPO, README_FILE, use_auth_token=False)
    diff --git a/tests/transformers/tests/utils/test_image_processing_utils.py b/tests/transformers/tests/utils/test_image_processing_utils.py
    deleted file mode 100644
    index afb6283e6e..0000000000
    --- a/tests/transformers/tests/utils/test_image_processing_utils.py
    +++ /dev/null
    @@ -1,71 +0,0 @@
    -# coding=utf-8
    -# Copyright 2022 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -from transformers.image_processing_utils import get_size_dict
    -
    -
    -class ImageProcessingUtilsTester(unittest.TestCase):
    -    def test_get_size_dict(self):
    -        # Test a dict with the wrong keys raises an error
    -        inputs = {"wrong_key": 224}
    -        with self.assertRaises(ValueError):
    -            get_size_dict(inputs)
    -
    -        inputs = {"height": 224}
    -        with self.assertRaises(ValueError):
    -            get_size_dict(inputs)
    -
    -        inputs = {"width": 224, "shortest_edge": 224}
    -        with self.assertRaises(ValueError):
    -            get_size_dict(inputs)
    -
    -        # Test a dict with the correct keys is returned as is
    -        inputs = {"height": 224, "width": 224}
    -        outputs = get_size_dict(inputs)
    -        self.assertEqual(outputs, inputs)
    -
    -        inputs = {"shortest_edge": 224}
    -        outputs = get_size_dict(inputs)
    -        self.assertEqual(outputs, {"shortest_edge": 224})
    -
    -        inputs = {"longest_edge": 224, "shortest_edge": 224}
    -        outputs = get_size_dict(inputs)
    -        self.assertEqual(outputs, {"longest_edge": 224, "shortest_edge": 224})
    -
    -        # Test a single int value which represents (size, size)
    -        outputs = get_size_dict(224)
    -        self.assertEqual(outputs, {"height": 224, "width": 224})
    -
    -        # Test a single int value which represents the shortest edge
    -        outputs = get_size_dict(224, default_to_square=False)
    -        self.assertEqual(outputs, {"shortest_edge": 224})
    -
    -        # Test a tuple of ints which represents (height, width)
    -        outputs = get_size_dict((150, 200))
    -        self.assertEqual(outputs, {"height": 150, "width": 200})
    -
    -        # Test a tuple of ints which represents (width, height)
    -        outputs = get_size_dict((150, 200), height_width_order=False)
    -        self.assertEqual(outputs, {"height": 200, "width": 150})
    -
    -        # Test an int representing the shortest edge and max_size which represents the longest edge
    -        outputs = get_size_dict(224, max_size=256, default_to_square=False)
    -        self.assertEqual(outputs, {"shortest_edge": 224, "longest_edge": 256})
    -
    -        # Test int with default_to_square=True and max_size fails
    -        with self.assertRaises(ValueError):
    -            get_size_dict(224, max_size=256, default_to_square=True)
    diff --git a/tests/transformers/tests/utils/test_image_utils.py b/tests/transformers/tests/utils/test_image_utils.py
    deleted file mode 100644
    index a5efb87c03..0000000000
    --- a/tests/transformers/tests/utils/test_image_utils.py
    +++ /dev/null
    @@ -1,628 +0,0 @@
    -# coding=utf-8
    -# Copyright 2021 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -
    -import datasets
    -import numpy as np
    -import pytest
    -from requests import ReadTimeout
    -from transformers import is_torch_available, is_vision_available
    -from transformers.image_utils import ChannelDimension, get_channel_dimension_axis, make_list_of_images
    -from transformers.testing_utils import is_flaky, require_torch, require_vision
    -
    -from tests.pipelines.test_pipelines_document_question_answering import INVOICE_URL
    -
    -
    -if is_torch_available():
    -    import torch
    -
    -if is_vision_available():
    -    import PIL.Image
    -    from transformers import ImageFeatureExtractionMixin
    -    from transformers.image_utils import get_image_size, infer_channel_dimension_format, load_image
    -
    -
    -def get_random_image(height, width):
    -    random_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
    -    return PIL.Image.fromarray(random_array)
    -
    -
    -@require_vision
    -class ImageFeatureExtractionTester(unittest.TestCase):
    -    def test_conversion_image_to_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -
    -        # Conversion with defaults (rescale + channel first)
    -        array1 = feature_extractor.to_numpy_array(image)
    -        self.assertTrue(array1.dtype, np.float32)
    -        self.assertEqual(array1.shape, (3, 16, 32))
    -
    -        # Conversion with rescale and not channel first
    -        array2 = feature_extractor.to_numpy_array(image, channel_first=False)
    -        self.assertTrue(array2.dtype, np.float32)
    -        self.assertEqual(array2.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array1, array2.transpose(2, 0, 1)))
    -
    -        # Conversion with no rescale and channel first
    -        array3 = feature_extractor.to_numpy_array(image, rescale=False)
    -        self.assertTrue(array3.dtype, np.uint8)
    -        self.assertEqual(array3.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array1, array3.astype(np.float32) * (1 / 255.0)))
    -
    -        # Conversion with no rescale and not channel first
    -        array4 = feature_extractor.to_numpy_array(image, rescale=False, channel_first=False)
    -        self.assertTrue(array4.dtype, np.uint8)
    -        self.assertEqual(array4.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array2, array4.astype(np.float32) * (1 / 255.0)))
    -
    -    def test_conversion_array_to_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        array = np.random.randint(0, 256, (16, 32, 3), dtype=np.uint8)
    -
    -        # By default, rescale (for an array of ints) and channel permute
    -        array1 = feature_extractor.to_numpy_array(array)
    -        self.assertTrue(array1.dtype, np.float32)
    -        self.assertEqual(array1.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0)))
    -
    -        # Same with no permute
    -        array2 = feature_extractor.to_numpy_array(array, channel_first=False)
    -        self.assertTrue(array2.dtype, np.float32)
    -        self.assertEqual(array2.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array2, array.astype(np.float32) * (1 / 255.0)))
    -
    -        # Force rescale to False
    -        array3 = feature_extractor.to_numpy_array(array, rescale=False)
    -        self.assertTrue(array3.dtype, np.uint8)
    -        self.assertEqual(array3.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array3, array.transpose(2, 0, 1)))
    -
    -        # Force rescale to False and no channel permute
    -        array4 = feature_extractor.to_numpy_array(array, rescale=False, channel_first=False)
    -        self.assertTrue(array4.dtype, np.uint8)
    -        self.assertEqual(array4.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array4, array))
    -
    -        # Now test the default rescale for a float array (defaults to False)
    -        array5 = feature_extractor.to_numpy_array(array2)
    -        self.assertTrue(array5.dtype, np.float32)
    -        self.assertEqual(array5.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array5, array1))
    -
    -    def test_make_list_of_images_numpy(self):
    -        # Test a single image is converted to a list of 1 image
    -        images = np.random.randint(0, 256, (16, 32, 3))
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 1)
    -        self.assertTrue(np.array_equal(images_list[0], images))
    -        self.assertIsInstance(images_list, list)
    -
    -        # Test a batch of images is converted to a list of images
    -        images = np.random.randint(0, 256, (4, 16, 32, 3))
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 4)
    -        self.assertTrue(np.array_equal(images_list[0], images[0]))
    -        self.assertIsInstance(images_list, list)
    -
    -        # Test a list of images is not modified
    -        images = [np.random.randint(0, 256, (16, 32, 3)) for _ in range(4)]
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 4)
    -        self.assertTrue(np.array_equal(images_list[0], images[0]))
    -        self.assertIsInstance(images_list, list)
    -
    -        # Test batched masks with no channel dimension are converted to a list of masks
    -        masks = np.random.randint(0, 2, (4, 16, 32))
    -        masks_list = make_list_of_images(masks, expected_ndims=2)
    -        self.assertEqual(len(masks_list), 4)
    -        self.assertTrue(np.array_equal(masks_list[0], masks[0]))
    -        self.assertIsInstance(masks_list, list)
    -
    -    @require_torch
    -    def test_make_list_of_images_torch(self):
    -        # Test a single image is converted to a list of 1 image
    -        images = torch.randint(0, 256, (16, 32, 3))
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 1)
    -        self.assertTrue(np.array_equal(images_list[0], images))
    -        self.assertIsInstance(images_list, list)
    -
    -        # Test a batch of images is converted to a list of images
    -        images = torch.randint(0, 256, (4, 16, 32, 3))
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 4)
    -        self.assertTrue(np.array_equal(images_list[0], images[0]))
    -        self.assertIsInstance(images_list, list)
    -
    -        # Test a list of images is left unchanged
    -        images = [torch.randint(0, 256, (16, 32, 3)) for _ in range(4)]
    -        images_list = make_list_of_images(images)
    -        self.assertEqual(len(images_list), 4)
    -        self.assertTrue(np.array_equal(images_list[0], images[0]))
    -        self.assertIsInstance(images_list, list)
    -
    -    @require_torch
    -    def test_conversion_torch_to_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        tensor = torch.randint(0, 256, (16, 32, 3))
    -        array = tensor.numpy()
    -
    -        # By default, rescale (for a tensor of ints) and channel permute
    -        array1 = feature_extractor.to_numpy_array(array)
    -        self.assertTrue(array1.dtype, np.float32)
    -        self.assertEqual(array1.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0)))
    -
    -        # Same with no permute
    -        array2 = feature_extractor.to_numpy_array(array, channel_first=False)
    -        self.assertTrue(array2.dtype, np.float32)
    -        self.assertEqual(array2.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array2, array.astype(np.float32) * (1 / 255.0)))
    -
    -        # Force rescale to False
    -        array3 = feature_extractor.to_numpy_array(array, rescale=False)
    -        self.assertTrue(array3.dtype, np.uint8)
    -        self.assertEqual(array3.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array3, array.transpose(2, 0, 1)))
    -
    -        # Force rescale to False and no channel permute
    -        array4 = feature_extractor.to_numpy_array(array, rescale=False, channel_first=False)
    -        self.assertTrue(array4.dtype, np.uint8)
    -        self.assertEqual(array4.shape, (16, 32, 3))
    -        self.assertTrue(np.array_equal(array4, array))
    -
    -        # Now test the default rescale for a float tensor (defaults to False)
    -        array5 = feature_extractor.to_numpy_array(array2)
    -        self.assertTrue(array5.dtype, np.float32)
    -        self.assertEqual(array5.shape, (3, 16, 32))
    -        self.assertTrue(np.array_equal(array5, array1))
    -
    -    def test_conversion_image_to_image(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -
    -        # On an image, `to_pil_image` is a no-op.
    -        image1 = feature_extractor.to_pil_image(image)
    -        self.assertTrue(isinstance(image, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image), np.array(image1)))
    -
    -    def test_conversion_array_to_image(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        array = np.random.randint(0, 256, (16, 32, 3), dtype=np.uint8)
    -
    -        # By default, no rescale (for an array of ints)
    -        image1 = feature_extractor.to_pil_image(array)
    -        self.assertTrue(isinstance(image1, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image1), array))
    -
    -        # If the array is channel-first, proper reordering of the channels is done.
    -        image2 = feature_extractor.to_pil_image(array.transpose(2, 0, 1))
    -        self.assertTrue(isinstance(image2, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image2), array))
    -
    -        # If the array has floating type, it's rescaled by default.
    -        image3 = feature_extractor.to_pil_image(array.astype(np.float32) * (1 / 255.0))
    -        self.assertTrue(isinstance(image3, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image3), array))
    -
    -        # You can override the default and disable rescaling.
    -        image4 = feature_extractor.to_pil_image(array.astype(np.float32), rescale=False)
    -        self.assertTrue(isinstance(image4, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image4), array))
    -
    -        # And with floats + channel first.
    -        image5 = feature_extractor.to_pil_image(array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0))
    -        self.assertTrue(isinstance(image5, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image5), array))
    -
    -    @require_torch
    -    def test_conversion_tensor_to_image(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        tensor = torch.randint(0, 256, (16, 32, 3))
    -        array = tensor.numpy()
    -
    -        # By default, no rescale (for a tensor of ints)
    -        image1 = feature_extractor.to_pil_image(tensor)
    -        self.assertTrue(isinstance(image1, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image1), array))
    -
    -        # If the tensor is channel-first, proper reordering of the channels is done.
    -        image2 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1))
    -        self.assertTrue(isinstance(image2, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image2), array))
    -
    -        # If the tensor has floating type, it's rescaled by default.
    -        image3 = feature_extractor.to_pil_image(tensor.float() / 255.0)
    -        self.assertTrue(isinstance(image3, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image3), array))
    -
    -        # You can override the default and disable rescaling.
    -        image4 = feature_extractor.to_pil_image(tensor.float(), rescale=False)
    -        self.assertTrue(isinstance(image4, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image4), array))
    -
    -        # And with floats + channel first.
    -        image5 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1).float() * (1 / 255.0))
    -        self.assertTrue(isinstance(image5, PIL.Image.Image))
    -        self.assertTrue(np.array_equal(np.array(image5), array))
    -
    -    def test_resize_image_and_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -        array = np.array(image)
    -
    -        # Size can be an int or a tuple of ints.
    -        resized_image = feature_extractor.resize(image, 8)
    -        self.assertTrue(isinstance(resized_image, PIL.Image.Image))
    -        self.assertEqual(resized_image.size, (8, 8))
    -
    -        resized_image1 = feature_extractor.resize(image, (8, 16))
    -        self.assertTrue(isinstance(resized_image1, PIL.Image.Image))
    -        self.assertEqual(resized_image1.size, (8, 16))
    -
    -        # Passing an array converts it to a PIL Image.
    -        resized_image2 = feature_extractor.resize(array, 8)
    -        self.assertTrue(isinstance(resized_image2, PIL.Image.Image))
    -        self.assertEqual(resized_image2.size, (8, 8))
    -        self.assertTrue(np.array_equal(np.array(resized_image), np.array(resized_image2)))
    -
    -        resized_image3 = feature_extractor.resize(image, (8, 16))
    -        self.assertTrue(isinstance(resized_image3, PIL.Image.Image))
    -        self.assertEqual(resized_image3.size, (8, 16))
    -        self.assertTrue(np.array_equal(np.array(resized_image1), np.array(resized_image3)))
    -
    -    def test_resize_image_and_array_non_default_to_square(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -
    -        heights_widths = [
    -            # height, width
    -            # square image
    -            (28, 28),
    -            (27, 27),
    -            # rectangular image: h < w
    -            (28, 34),
    -            (29, 35),
    -            # rectangular image: h > w
    -            (34, 28),
    -            (35, 29),
    -        ]
    -
    -        # single integer or single integer in tuple/list
    -        sizes = [22, 27, 28, 36, [22], (27,)]
    -
    -        for (height, width), size in zip(heights_widths, sizes):
    -            for max_size in (None, 37, 1000):
    -                image = get_random_image(height, width)
    -                array = np.array(image)
    -
    -                size = size[0] if isinstance(size, (list, tuple)) else size
    -                # Size can be an int or a tuple of ints.
    -                # If size is an int, the smaller edge of the image will be matched to this number,
    -                # i.e., if height > width, then the image will be rescaled to (size * height / width, size).
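    -                # When set, max_size caps the longer output edge and scales the other edge down proportionally.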
    -                if height < width:
    -                    exp_w, exp_h = (int(size * width / height), size)
    -                    if max_size is not None and max_size < exp_w:
    -                        exp_w, exp_h = max_size, int(max_size * exp_h / exp_w)
    -                elif width < height:
    -                    exp_w, exp_h = (size, int(size * height / width))
    -                    if max_size is not None and max_size < exp_h:
    -                        exp_w, exp_h = int(max_size * exp_w / exp_h), max_size
    -                else:
    -                    exp_w, exp_h = (size, size)
    -                    if max_size is not None and max_size < size:
    -                        exp_w, exp_h = max_size, max_size
    -
    -                resized_image = feature_extractor.resize(image, size=size, default_to_square=False, max_size=max_size)
    -                self.assertTrue(isinstance(resized_image, PIL.Image.Image))
    -                self.assertEqual(resized_image.size, (exp_w, exp_h))
    -
    -                # Passing an array converts it to a PIL Image.
    -                resized_image2 = feature_extractor.resize(array, size=size, default_to_square=False, max_size=max_size)
    -                self.assertTrue(isinstance(resized_image2, PIL.Image.Image))
    -                self.assertEqual(resized_image2.size, (exp_w, exp_h))
    -                self.assertTrue(np.array_equal(np.array(resized_image), np.array(resized_image2)))
    -
    -    @require_torch
    -    def test_resize_tensor(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        tensor = torch.randint(0, 256, (16, 32, 3))
    -        array = tensor.numpy()
    -
    -        # Size can be an int or a tuple of ints.
    -        resized_image = feature_extractor.resize(tensor, 8)
    -        self.assertTrue(isinstance(resized_image, PIL.Image.Image))
    -        self.assertEqual(resized_image.size, (8, 8))
    -
    -        resized_image1 = feature_extractor.resize(tensor, (8, 16))
    -        self.assertTrue(isinstance(resized_image1, PIL.Image.Image))
    -        self.assertEqual(resized_image1.size, (8, 16))
    -
    -        # Check we get the same results as with NumPy arrays.
    -        resized_image2 = feature_extractor.resize(array, 8)
    -        self.assertTrue(np.array_equal(np.array(resized_image), np.array(resized_image2)))
    -
    -        resized_image3 = feature_extractor.resize(array, (8, 16))
    -        self.assertTrue(np.array_equal(np.array(resized_image1), np.array(resized_image3)))
    -
    -    def test_normalize_image(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -        array = np.array(image)
    -        mean = [0.1, 0.5, 0.9]
    -        std = [0.2, 0.4, 0.6]
    -
    -        # PIL Images are converted to NumPy arrays for the normalization
    -        normalized_image = feature_extractor.normalize(image, mean, std)
    -        self.assertTrue(isinstance(normalized_image, np.ndarray))
    -        self.assertEqual(normalized_image.shape, (3, 16, 32))
    -
    -        # During the conversion, rescaling and channel-first permutation will be applied.
    -        expected = array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0)
    -        np_mean = np.array(mean).astype(np.float32)[:, None, None]
    -        np_std = np.array(std).astype(np.float32)[:, None, None]
    -        expected = (expected - np_mean) / np_std
    -        self.assertTrue(np.array_equal(normalized_image, expected))
    -
    -    def test_normalize_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        array = np.random.random((16, 32, 3))
    -        mean = [0.1, 0.5, 0.9]
    -        std = [0.2, 0.4, 0.6]
    -
    -        # mean and std can be passed as lists or NumPy arrays.
    -        expected = (array - np.array(mean)) / np.array(std)
    -        normalized_array = feature_extractor.normalize(array, mean, std)
    -        self.assertTrue(np.array_equal(normalized_array, expected))
    -
    -        normalized_array = feature_extractor.normalize(array, np.array(mean), np.array(std))
    -        self.assertTrue(np.array_equal(normalized_array, expected))
    -
    -        # Normalize will detect automatically if channel first or channel last is used.
    -        array = np.random.random((3, 16, 32))
    -        expected = (array - np.array(mean)[:, None, None]) / np.array(std)[:, None, None]
    -        normalized_array = feature_extractor.normalize(array, mean, std)
    -        self.assertTrue(np.array_equal(normalized_array, expected))
    -
    -        normalized_array = feature_extractor.normalize(array, np.array(mean), np.array(std))
    -        self.assertTrue(np.array_equal(normalized_array, expected))
    -
    -    @require_torch
    -    def test_normalize_tensor(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        tensor = torch.rand(16, 32, 3)
    -        mean = [0.1, 0.5, 0.9]
    -        std = [0.2, 0.4, 0.6]
    -
    -        # mean and std can be passed as lists or tensors.
    -        expected = (tensor - torch.tensor(mean)) / torch.tensor(std)
    -        normalized_tensor = feature_extractor.normalize(tensor, mean, std)
    -        self.assertTrue(torch.equal(normalized_tensor, expected))
    -
    -        normalized_tensor = feature_extractor.normalize(tensor, torch.tensor(mean), torch.tensor(std))
    -        self.assertTrue(torch.equal(normalized_tensor, expected))
    -
    -        # Normalize will detect automatically if channel first or channel last is used.
    -        tensor = torch.rand(3, 16, 32)
    -        expected = (tensor - torch.tensor(mean)[:, None, None]) / torch.tensor(std)[:, None, None]
    -        normalized_tensor = feature_extractor.normalize(tensor, mean, std)
    -        self.assertTrue(torch.equal(normalized_tensor, expected))
    -
    -        normalized_tensor = feature_extractor.normalize(tensor, torch.tensor(mean), torch.tensor(std))
    -        self.assertTrue(torch.equal(normalized_tensor, expected))
    -
    -    def test_center_crop_image(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -
    -        # Test various crop sizes: smaller than the image in both dimensions, bigger in one dimension only, and bigger in both dimensions.
    -        crop_sizes = [8, (8, 64), 20, (32, 64)]
    -        for size in crop_sizes:
    -            cropped_image = feature_extractor.center_crop(image, size)
    -            self.assertTrue(isinstance(cropped_image, PIL.Image.Image))
    -
    -            # PIL Image.size is transposed compared to NumPy or PyTorch (width first instead of height first).
    -            expected_size = (size, size) if isinstance(size, int) else (size[1], size[0])
    -            self.assertEqual(cropped_image.size, expected_size)
    -
    -    def test_center_crop_array(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -        array = feature_extractor.to_numpy_array(image)
    -
    -        # Test various crop sizes: smaller than the image in both dimensions, bigger in one dimension only, and bigger in both dimensions.
    -        crop_sizes = [8, (8, 64), 20, (32, 64)]
    -        for size in crop_sizes:
    -            cropped_array = feature_extractor.center_crop(array, size)
    -            self.assertTrue(isinstance(cropped_array, np.ndarray))
    -
    -            expected_size = (size, size) if isinstance(size, int) else size
    -            self.assertEqual(cropped_array.shape[-2:], expected_size)
    -
    -            # Check result is consistent with PIL.Image.crop
    -            cropped_image = feature_extractor.center_crop(image, size)
    -            self.assertTrue(np.array_equal(cropped_array, feature_extractor.to_numpy_array(cropped_image)))
    -
    -    @require_torch
    -    def test_center_crop_tensor(self):
    -        feature_extractor = ImageFeatureExtractionMixin()
    -        image = get_random_image(16, 32)
    -        array = feature_extractor.to_numpy_array(image)
    -        tensor = torch.tensor(array)
    -
    -        # Test various crop sizes: smaller than the image in both dimensions, bigger in one dimension only, and bigger in both dimensions.
    -        crop_sizes = [8, (8, 64), 20, (32, 64)]
    -        for size in crop_sizes:
    -            cropped_tensor = feature_extractor.center_crop(tensor, size)
    -            self.assertTrue(isinstance(cropped_tensor, torch.Tensor))
    -
    -            expected_size = (size, size) if isinstance(size, int) else size
    -            self.assertEqual(cropped_tensor.shape[-2:], expected_size)
    -
    -            # Check result is consistent with PIL.Image.crop
    -            cropped_image = feature_extractor.center_crop(image, size)
    -            self.assertTrue(torch.equal(cropped_tensor, torch.tensor(feature_extractor.to_numpy_array(cropped_image))))
    -
    -
    -@require_vision
    -class LoadImageTester(unittest.TestCase):
    -    def test_load_img_url(self):
    -        img = load_image(INVOICE_URL)
    -        img_arr = np.array(img)
    -
    -        self.assertEqual(img_arr.shape, (1061, 750, 3))
    -
    -    @is_flaky()
    -    def test_load_img_url_timeout(self):
    -        with self.assertRaises(ReadTimeout):
    -            load_image(INVOICE_URL, timeout=0.001)
    -
    -    def test_load_img_local(self):
    -        img = load_image("./tests/fixtures/tests_samples/COCO/000000039769.png")
    -        img_arr = np.array(img)
    -
    -        self.assertEqual(
    -            img_arr.shape,
    -            (480, 640, 3),
    -        )
    -
    -    def test_load_img_rgba(self):
    -        dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
    -
    -        img = load_image(dataset[0]["file"])  # img with mode RGBA
    -        img_arr = np.array(img)
    -
    -        self.assertEqual(
    -            img_arr.shape,
    -            (512, 512, 3),
    -        )
    -
    -    def test_load_img_la(self):
    -        dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
    -
    -        img = load_image(dataset[1]["file"])  # img with mode LA
    -        img_arr = np.array(img)
    -
    -        self.assertEqual(
    -            img_arr.shape,
    -            (512, 768, 3),
    -        )
    -
    -    def test_load_img_l(self):
    -        dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
    -
    -        img = load_image(dataset[2]["file"])  # img with mode L
    -        img_arr = np.array(img)
    -
    -        self.assertEqual(
    -            img_arr.shape,
    -            (381, 225, 3),
    -        )
    -
    -    def test_load_img_exif_transpose(self):
    -        dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
    -        img_file = dataset[3]["file"]
    -
    -        img_without_exif_transpose = PIL.Image.open(img_file)
    -        img_arr_without_exif_transpose = np.array(img_without_exif_transpose)
    -
    -        self.assertEqual(
    -            img_arr_without_exif_transpose.shape,
    -            (333, 500, 3),
    -        )
    -
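    -        # load_image applies the EXIF orientation tag, so the raw (333, 500) image comes back rotated to (500, 333)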
    -        img_with_exif_transpose = load_image(img_file)
    -        img_arr_with_exif_transpose = np.array(img_with_exif_transpose)
    -
    -        self.assertEqual(
    -            img_arr_with_exif_transpose.shape,
    -            (500, 333, 3),
    -        )
    -
    -
    -class UtilFunctionTester(unittest.TestCase):
    -    def test_get_image_size(self):
    -        # Test we can infer the size and channel dimension of an image.
    -        image = np.random.randint(0, 256, (32, 64, 3))
    -        self.assertEqual(get_image_size(image), (32, 64))
    -
    -        image = np.random.randint(0, 256, (3, 32, 64))
    -        self.assertEqual(get_image_size(image), (32, 64))
    -
    -        # Test the channel dimension can be overridden
    -        image = np.random.randint(0, 256, (3, 32, 64))
    -        self.assertEqual(get_image_size(image, channel_dim=ChannelDimension.LAST), (3, 32))
    -
    -    def test_infer_channel_dimension(self):
    -        # Test we fail with invalid input
    -        with pytest.raises(ValueError):
    -            infer_channel_dimension_format(np.random.randint(0, 256, (10, 10)))
    -
    -        with pytest.raises(ValueError):
    -            infer_channel_dimension_format(np.random.randint(0, 256, (10, 10, 10, 10, 10)))
    -
    -        # Test we fail if neither the first nor the last dimension is of size 3 or 1
    -        with pytest.raises(ValueError):
    -            infer_channel_dimension_format(np.random.randint(0, 256, (10, 1, 50)))
    -
    -        # But if we explicitly set the number of channels to 50, it works
    -        inferred_dim = infer_channel_dimension_format(np.random.randint(0, 256, (10, 1, 50)), num_channels=50)
    -        self.assertEqual(inferred_dim, ChannelDimension.LAST)
    -
    -        # Test we correctly identify the channel dimension
    -        image = np.random.randint(0, 256, (3, 4, 5))
    -        inferred_dim = infer_channel_dimension_format(image)
    -        self.assertEqual(inferred_dim, ChannelDimension.FIRST)
    -
    -        image = np.random.randint(0, 256, (1, 4, 5))
    -        inferred_dim = infer_channel_dimension_format(image)
    -        self.assertEqual(inferred_dim, ChannelDimension.FIRST)
    -
    -        image = np.random.randint(0, 256, (4, 5, 3))
    -        inferred_dim = infer_channel_dimension_format(image)
    -        self.assertEqual(inferred_dim, ChannelDimension.LAST)
    -
    -        image = np.random.randint(0, 256, (4, 5, 1))
    -        inferred_dim = infer_channel_dimension_format(image)
    -        self.assertEqual(inferred_dim, ChannelDimension.LAST)
    -
    -        # We can take a batched array of images and find the dimension
    -        image = np.random.randint(0, 256, (1, 3, 4, 5))
    -        inferred_dim = infer_channel_dimension_format(image)
    -        self.assertEqual(inferred_dim, ChannelDimension.FIRST)
    -
    -    def test_get_channel_dimension_axis(self):
    -        # Test we correctly identify the channel dimension
    -        image = np.random.randint(0, 256, (3, 4, 5))
    -        inferred_axis = get_channel_dimension_axis(image)
    -        self.assertEqual(inferred_axis, 0)
    -
    -        image = np.random.randint(0, 256, (1, 4, 5))
    -        inferred_axis = get_channel_dimension_axis(image)
    -        self.assertEqual(inferred_axis, 0)
    -
    -        image = np.random.randint(0, 256, (4, 5, 3))
    -        inferred_axis = get_channel_dimension_axis(image)
    -        self.assertEqual(inferred_axis, 2)
    -
    -        image = np.random.randint(0, 256, (4, 5, 1))
    -        inferred_axis = get_channel_dimension_axis(image)
    -        self.assertEqual(inferred_axis, 2)
    -
    -        # We can take a batched array of images and find the dimension
    -        image = np.random.randint(0, 256, (1, 3, 4, 5))
    -        inferred_axis = get_channel_dimension_axis(image)
    -        self.assertEqual(inferred_axis, 1)
    diff --git a/tests/transformers/tests/utils/test_logging.py b/tests/transformers/tests/utils/test_logging.py
    deleted file mode 100644
    index 50b3ad78cf..0000000000
    --- a/tests/transformers/tests/utils/test_logging.py
    +++ /dev/null
    @@ -1,134 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import os
    -import unittest
    -
    -import transformers.models.bart.tokenization_bart
    -from huggingface_hub.utils import are_progress_bars_disabled
    -from transformers import logging
    -from transformers.testing_utils import CaptureLogger, mockenv, mockenv_context
    -from transformers.utils.logging import disable_progress_bar, enable_progress_bar
    -
    -
    -class LoggingTester(unittest.TestCase):
    -    def test_set_level(self):
    -        logger = logging.get_logger()
    -
    -        # the current default level is logging.WARNING
    -        level_origin = logging.get_verbosity()
    -
    -        logging.set_verbosity_error()
    -        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())
    -
    -        logging.set_verbosity_warning()
    -        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())
    -
    -        logging.set_verbosity_info()
    -        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())
    -
    -        logging.set_verbosity_debug()
    -        self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())
    -
    -        # restore to the original level
    -        logging.set_verbosity(level_origin)
    -
    -    def test_integration(self):
    -        level_origin = logging.get_verbosity()
    -
    -        logger = logging.get_logger("transformers.models.bart.tokenization_bart")
    -        msg = "Testing 1, 2, 3"
    -
    -        # should be able to log warnings (if default settings weren't overridden by `pytest --log-level-all`)
    -        if level_origin <= logging.WARNING:
    -            with CaptureLogger(logger) as cl:
    -                logger.warning(msg)
    -            self.assertEqual(cl.out, msg + "\n")
    -
    -        # this is setting the level for all of `transformers.*` loggers
    -        logging.set_verbosity_error()
    -
    -        # should not be able to log warnings
    -        with CaptureLogger(logger) as cl:
    -            logger.warning(msg)
    -        self.assertEqual(cl.out, "")
    -
    -        # should be able to log warnings again
    -        logging.set_verbosity_warning()
    -        with CaptureLogger(logger) as cl:
    -            logger.warning(msg)
    -        self.assertEqual(cl.out, msg + "\n")
    -
    -        # restore to the original level
    -        logging.set_verbosity(level_origin)
    -
    -    @mockenv(TRANSFORMERS_VERBOSITY="error")
    -    def test_env_override(self):
    -        # reset so that the env var takes effect the next time a logger call is made
    -        transformers.utils.logging._reset_library_root_logger()
    -        # this action activates the env var
    -        _ = logging.get_logger("transformers.models.bart.tokenization_bart")
    -
    -        env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
    -        env_level = logging.log_levels[env_level_str]
    -
    -        current_level = logging.get_verbosity()
    -        self.assertEqual(
    -            env_level,
    -            current_level,
    -            f"TRANSFORMERS_VERBOSITY={env_level_str}/{env_level}, but internal verbosity is {current_level}",
    -        )
    -
    -        # restore to the original level
    -        os.environ["TRANSFORMERS_VERBOSITY"] = ""
    -        transformers.utils.logging._reset_library_root_logger()
    -
    -    @mockenv(TRANSFORMERS_VERBOSITY="super-error")
    -    def test_env_invalid_override(self):
    -        # reset so that the env var takes effect the next time a logger call is made
    -        transformers.utils.logging._reset_library_root_logger()
    -        logger = logging.logging.getLogger()
    -        with CaptureLogger(logger) as cl:
    -            # this action activates the env var
    -            logging.get_logger("transformers.models.bart.tokenization_bart")
    -        self.assertIn("Unknown option TRANSFORMERS_VERBOSITY=super-error", cl.out)
    -
    -        # no need to restore as nothing was changed
    -
    -    def test_advisory_warnings(self):
    -        # testing `logger.warning_advice()`
    -        transformers.utils.logging._reset_library_root_logger()
    -
    -        logger = logging.get_logger("transformers.models.bart.tokenization_bart")
    -        msg = "Testing 1, 2, 3"
    -
    -        with mockenv_context(TRANSFORMERS_NO_ADVISORY_WARNINGS="1"):
    -            # nothing should be logged as env var disables this method
    -            with CaptureLogger(logger) as cl:
    -                logger.warning_advice(msg)
    -            self.assertEqual(cl.out, "")
    -
    -        with mockenv_context(TRANSFORMERS_NO_ADVISORY_WARNINGS=""):
    -            # should log normally as TRANSFORMERS_NO_ADVISORY_WARNINGS is unset
    -            with CaptureLogger(logger) as cl:
    -                logger.warning_advice(msg)
    -            self.assertEqual(cl.out, msg + "\n")
    -
    -
    -def test_set_progress_bar_enabled():
    -    disable_progress_bar()
    -    assert are_progress_bars_disabled()
    -
    -    enable_progress_bar()
    -    assert not are_progress_bars_disabled()
    diff --git a/tests/transformers/tests/utils/test_model_card.py b/tests/transformers/tests/utils/test_model_card.py
    deleted file mode 100644
    index 7d0e8795e0..0000000000
    --- a/tests/transformers/tests/utils/test_model_card.py
    +++ /dev/null
    @@ -1,84 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -
    -import json
    -import os
    -import tempfile
    -import unittest
    -
    -from transformers.modelcard import ModelCard
    -
    -
    -class ModelCardTester(unittest.TestCase):
    -    def setUp(self):
    -        self.inputs_dict = {
    -            "model_details": {
    -                "Organization": "testing",
    -                "Model date": "today",
    -                "Model version": "v2.1, Developed by Test Corp in 2019.",
    -                "Architecture": "Convolutional Neural Network.",
    -            },
    -            "metrics": "BLEU and ROUGE-1",
    -            "evaluation_data": {
    -                "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1"},
    -                "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf",
    -            },
    -            "training_data": {
    -                "Dataset": "English Wikipedia dump dated 2018-12-01",
    -                "Preprocessing": (
    -                    "Using SentencePiece vocabulary of size 52k tokens. See details on"
    -                    " https://arxiv.org/pdf/1810.03993.pdf"
    -                ),
    -            },
    -            "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76},
    -        }
    -
    -    def test_model_card_common_properties(self):
    -        modelcard = ModelCard.from_dict(self.inputs_dict)
    -        self.assertTrue(hasattr(modelcard, "model_details"))
    -        self.assertTrue(hasattr(modelcard, "intended_use"))
    -        self.assertTrue(hasattr(modelcard, "factors"))
    -        self.assertTrue(hasattr(modelcard, "metrics"))
    -        self.assertTrue(hasattr(modelcard, "evaluation_data"))
    -        self.assertTrue(hasattr(modelcard, "training_data"))
    -        self.assertTrue(hasattr(modelcard, "quantitative_analyses"))
    -        self.assertTrue(hasattr(modelcard, "ethical_considerations"))
    -        self.assertTrue(hasattr(modelcard, "caveats_and_recommendations"))
    -
    -    def test_model_card_to_json_string(self):
    -        modelcard = ModelCard.from_dict(self.inputs_dict)
    -        obj = json.loads(modelcard.to_json_string())
    -        for key, value in self.inputs_dict.items():
    -            self.assertEqual(obj[key], value)
    -
    -    def test_model_card_to_json_file(self):
    -        model_card_first = ModelCard.from_dict(self.inputs_dict)
    -
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            filename = os.path.join(tmpdirname, "modelcard.json")
    -            model_card_first.to_json_file(filename)
    -            model_card_second = ModelCard.from_json_file(filename)
    -
    -        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
    -
    -    def test_model_card_from_and_save_pretrained(self):
    -        model_card_first = ModelCard.from_dict(self.inputs_dict)
    -
    -        with tempfile.TemporaryDirectory() as tmpdirname:
    -            model_card_first.save_pretrained(tmpdirname)
    -            model_card_second = ModelCard.from_pretrained(tmpdirname)
    -
    -        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
    diff --git a/tests/transformers/tests/utils/test_model_output.py b/tests/transformers/tests/utils/test_model_output.py
    deleted file mode 100644
    index b415b6c2ef..0000000000
    --- a/tests/transformers/tests/utils/test_model_output.py
    +++ /dev/null
    @@ -1,145 +0,0 @@
    -# coding=utf-8
    -# Copyright 2020 The Hugging Face Team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import unittest
    -from dataclasses import dataclass
    -from typing import Optional
    -
    -from transformers.testing_utils import require_torch
    -from transformers.utils import ModelOutput
    -
    -
    -@dataclass
    -class ModelOutputTest(ModelOutput):
    -    a: float
    -    b: Optional[float] = None
    -    c: Optional[float] = None
    -
    -
    -class ModelOutputTester(unittest.TestCase):
    -    def test_get_attributes(self):
    -        x = ModelOutputTest(a=30)
    -        self.assertEqual(x.a, 30)
    -        self.assertIsNone(x.b)
    -        self.assertIsNone(x.c)
    -        with self.assertRaises(AttributeError):
    -            _ = x.d
    -
    -    def test_index_with_ints_and_slices(self):
    -        x = ModelOutputTest(a=30, b=10)
    -        self.assertEqual(x[0], 30)
    -        self.assertEqual(x[1], 10)
    -        self.assertEqual(x[:2], (30, 10))
    -        self.assertEqual(x[:], (30, 10))
    -
    -        x = ModelOutputTest(a=30, c=10)
    -        self.assertEqual(x[0], 30)
    -        self.assertEqual(x[1], 10)
    -        self.assertEqual(x[:2], (30, 10))
    -        self.assertEqual(x[:], (30, 10))
    -
    -    def test_index_with_strings(self):
    -        x = ModelOutputTest(a=30, b=10)
    -        self.assertEqual(x["a"], 30)
    -        self.assertEqual(x["b"], 10)
    -        with self.assertRaises(KeyError):
    -            _ = x["c"]
    -
    -        x = ModelOutputTest(a=30, c=10)
    -        self.assertEqual(x["a"], 30)
    -        self.assertEqual(x["c"], 10)
    -        with self.assertRaises(KeyError):
    -            _ = x["b"]
    -
    -    def test_dict_like_properties(self):
    -        x = ModelOutputTest(a=30)
    -        self.assertEqual(list(x.keys()), ["a"])
    -        self.assertEqual(list(x.values()), [30])
    -        self.assertEqual(list(x.items()), [("a", 30)])
    -        self.assertEqual(list(x), ["a"])
    -
    -        x = ModelOutputTest(a=30, b=10)
    -        self.assertEqual(list(x.keys()), ["a", "b"])
    -        self.assertEqual(list(x.values()), [30, 10])
    -        self.assertEqual(list(x.items()), [("a", 30), ("b", 10)])
    -        self.assertEqual(list(x), ["a", "b"])
    -
    -        x = ModelOutputTest(a=30, c=10)
    -        self.assertEqual(list(x.keys()), ["a", "c"])
    -        self.assertEqual(list(x.values()), [30, 10])
    -        self.assertEqual(list(x.items()), [("a", 30), ("c", 10)])
    -        self.assertEqual(list(x), ["a", "c"])
    -
    -        with self.assertRaises(Exception):
    -            x = x.update({"d": 20})
    -        with self.assertRaises(Exception):
    -            del x["a"]
    -        with self.assertRaises(Exception):
    -            _ = x.pop("a")
    -        with self.assertRaises(Exception):
    -            _ = x.setdefault("d", 32)
    -
    -    def test_set_attributes(self):
    -        x = ModelOutputTest(a=30)
    -        x.a = 10
    -        self.assertEqual(x.a, 10)
    -        self.assertEqual(x["a"], 10)
    -
    -    def test_set_keys(self):
    -        x = ModelOutputTest(a=30)
    -        x["a"] = 10
    -        self.assertEqual(x.a, 10)
    -        self.assertEqual(x["a"], 10)
    -
    -    def test_instantiate_from_dict(self):
    -        x = ModelOutputTest({"a": 30, "b": 10})
    -        self.assertEqual(list(x.keys()), ["a", "b"])
    -        self.assertEqual(x.a, 30)
    -        self.assertEqual(x.b, 10)
    -
    -    def test_instantiate_from_iterator(self):
    -        x = ModelOutputTest([("a", 30), ("b", 10)])
    -        self.assertEqual(list(x.keys()), ["a", "b"])
    -        self.assertEqual(x.a, 30)
    -        self.assertEqual(x.b, 10)
    -
    -        with self.assertRaises(ValueError):
    -            _ = ModelOutputTest([("a", 30), (10, 10)])
    -
    -        x = ModelOutputTest(a=(30, 30))
    -        self.assertEqual(list(x.keys()), ["a"])
    -        self.assertEqual(x.a, (30, 30))
    -
    -    @require_torch
    -    def test_torch_pytree(self):
    -        # ensure torch.utils._pytree treats ModelOutput subclasses as nodes (and not leaves)
    -        # this is important for DistributedDataParallel gradient synchronization with static_graph=True
    -        import torch
    -        import torch.utils._pytree
    -
    -        x = ModelOutputTest(a=1.0, c=2.0)
    -        self.assertFalse(torch.utils._pytree._is_leaf(x))
    -
    -        expected_flat_outs = [1.0, 2.0]
    -        expected_tree_spec = torch.utils._pytree.TreeSpec(
    -            ModelOutputTest, ["a", "c"], [torch.utils._pytree.LeafSpec(), torch.utils._pytree.LeafSpec()]
    -        )
    -
    -        actual_flat_outs, actual_tree_spec = torch.utils._pytree.tree_flatten(x)
    -        self.assertEqual(expected_flat_outs, actual_flat_outs)
    -        self.assertEqual(expected_tree_spec, actual_tree_spec)
    -
    -        unflattened_x = torch.utils._pytree.tree_unflatten(actual_flat_outs, actual_tree_spec)
    -        self.assertEqual(x, unflattened_x)
    diff --git a/tests/transformers/tests/utils/test_modeling_tf_core.py b/tests/transformers/tests/utils/test_modeling_tf_core.py
    deleted file mode 100644
    index d170dba3ad..0000000000
    --- a/tests/transformers/tests/utils/test_modeling_tf_core.py
    +++ /dev/null
    @@ -1,416 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -
    -from __future__ import annotations
    -
    -import copy
    -import os
    -import tempfile
    -from importlib import import_module
    -from math import isnan
    -
    -from transformers import is_tf_available
    -from transformers.models.auto import get_values
    -from transformers.testing_utils import _tf_gpu_memory_limit, require_tf, slow
    -
    -from ..test_modeling_tf_common import ids_tensor
    -
    -
    -if is_tf_available():
    -    import numpy as np
    -    import tensorflow as tf
    -    from transformers import (
    -        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
    -        TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
    -        TF_MODEL_FOR_MASKED_LM_MAPPING,
    -        TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
    -        TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
    -        TF_MODEL_FOR_PRETRAINING_MAPPING,
    -        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    -        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    -        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    -        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
    -        TFSharedEmbeddings,
    -    )
    -
    -    if _tf_gpu_memory_limit is not None:
    -        gpus = tf.config.list_physical_devices("GPU")
    -        for gpu in gpus:
    -            # Restrict TensorFlow to only allocate x GB of memory on the GPUs
    -            try:
    -                tf.config.set_logical_device_configuration(
    -                    gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)]
    -                )
    -                logical_gpus = tf.config.list_logical_devices("GPU")
    -                print("Logical GPUs", logical_gpus)
    -            except RuntimeError as e:
    -                # Virtual devices must be set before GPUs have been initialized
    -                print(e)
    -
    -
    -@require_tf
    -class TFCoreModelTesterMixin:
    -    model_tester = None
    -    all_model_classes = ()
    -    all_generative_model_classes = ()
    -    test_mismatched_shapes = True
    -    test_resize_embeddings = True
    -    test_head_masking = True
    -    is_encoder_decoder = False
    -
    -    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
    -        inputs_dict = copy.deepcopy(inputs_dict)
    -
    -        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
    -            inputs_dict = {
    -                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
    -                if isinstance(v, tf.Tensor) and v.ndim > 0
    -                else v
    -                for k, v in inputs_dict.items()
    -            }
    -
    -        if return_labels:
    -            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
    -                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
    -            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
    -                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
    -                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
    -            elif model_class in [
    -                *get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
    -                *get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
    -            ]:
    -                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
    -            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
    -                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
    -            elif model_class in [
    -                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
    -                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
    -                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
    -                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
    -                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
    -            ]:
    -                inputs_dict["labels"] = tf.zeros(
    -                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
    -                )
    -        return inputs_dict
    -
    -    @slow
    -    def test_graph_mode(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes[:2]:
    -            inputs = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config)
    -
    -            @tf.function
    -            def run_in_graph_mode():
    -                return model(inputs)
    -
    -            outputs = run_in_graph_mode()
    -            self.assertIsNotNone(outputs)
    -
    -    @slow
    -    def test_xla_mode(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes[:2]:
    -            inputs = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config)
    -
    -            @tf.function(experimental_compile=True)
    -            def run_in_graph_mode():
    -                return model(inputs)
    -
    -            outputs = run_in_graph_mode()
    -            self.assertIsNotNone(outputs)
    -
    -    @slow
    -    def test_xla_fit(self):
    -        # This is a copy of the test_keras_fit method, but we use XLA compilation instead of eager
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        for model_class in self.all_model_classes[:2]:
    -            model = model_class(config)
    -            if getattr(model, "hf_compute_loss", None):
    -                # Test that the model correctly computes the loss with kwargs
    -                prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
    -                # Is there a better way to remove these decoder inputs?
    -                prepared_for_class = {
    -                    key: val
    -                    for key, val in prepared_for_class.items()
    -                    if key not in ("head_mask", "decoder_head_mask", "cross_attn_head_mask", "decoder_input_ids")
    -                }
    -
    -                possible_label_cols = {
    -                    "labels",
    -                    "label",
    -                    "label_ids",
    -                    "start_positions",
    -                    "start_position",
    -                    "end_positions",
    -                    "end_position",
    -                    "next_sentence_label",
    -                }
    -                label_names = possible_label_cols.intersection(set(prepared_for_class))
    -                self.assertGreater(len(label_names), 0, msg="No matching label names found!")
    -                labels = {key: val for key, val in prepared_for_class.items() if key in label_names}
    -                inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names}
    -                self.assertGreater(len(inputs_minus_labels), 0)
    -
    -                # Make sure it works with XLA!
    -                model.compile(optimizer=tf.keras.optimizers.SGD(0.0), jit_compile=True)
    -                # Make sure the model fits without crashing regardless of where we pass the labels
    -                history = model.fit(
    -                    prepared_for_class,
    -                    validation_data=prepared_for_class,
    -                    steps_per_epoch=1,
    -                    validation_steps=1,
    -                    shuffle=False,
    -                    verbose=0,
    -                )
    -                loss = history.history["loss"][0]
    -                self.assertTrue(not isnan(loss))
    -                val_loss = history.history["val_loss"][0]
    -                self.assertTrue(not isnan(val_loss))
    -
    -                # Now test it with separate labels, to make sure that path works in XLA too.
    -                model = model_class(config)
    -                model.compile(optimizer=tf.keras.optimizers.SGD(0.0), jit_compile=True)
    -                history = model.fit(
    -                    inputs_minus_labels,
    -                    labels,
    -                    validation_data=(inputs_minus_labels, labels),
    -                    steps_per_epoch=1,
    -                    validation_steps=1,
    -                    shuffle=False,
    -                    verbose=0,
    -                )
    -
    -                loss = history.history["loss"][0]
    -                self.assertTrue(not isnan(loss))
    -                val_loss = history.history["val_loss"][0]
    -                self.assertTrue(not isnan(val_loss))
    -
    -    @slow
    -    def test_saved_model_creation_extended(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        config.output_hidden_states = True
    -        config.output_attentions = True
    -
    -        if hasattr(config, "use_cache"):
    -            config.use_cache = True
    -
    -        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length)
    -        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
    -
    -        for model_class in self.all_model_classes[:2]:
    -            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
    -            model = model_class(config)
    -            model.build()
    -            num_out = len(model(class_inputs_dict))
    -
    -            for key in list(class_inputs_dict.keys()):
    -                # Remove keys not in the serving signature, as the SavedModel will not be compiled to deal with them
    -                if key not in model.input_signature:
    -                    del class_inputs_dict[key]
    -                # Check it's a tensor, in case the inputs dict has some bools in it too
    -                elif isinstance(class_inputs_dict[key], tf.Tensor) and class_inputs_dict[key].dtype.is_integer:
    -                    class_inputs_dict[key] = tf.cast(class_inputs_dict[key], tf.int32)
    -
    -            if set(class_inputs_dict.keys()) != set(model.input_signature.keys()):
    -                continue  # Some models have inputs that the preparation functions don't create, we skip those
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                model.save_pretrained(tmpdirname, saved_model=True)
    -                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
    -                model = tf.keras.models.load_model(saved_model_dir)
    -                outputs = model(class_inputs_dict)
    -
    -                if self.is_encoder_decoder:
    -                    output_hidden_states = outputs["encoder_hidden_states"]
    -                    output_attentions = outputs["encoder_attentions"]
    -                else:
    -                    output_hidden_states = outputs["hidden_states"]
    -                    output_attentions = outputs["attentions"]
    -
    -                self.assertEqual(len(outputs), num_out)
    -
    -                expected_num_layers = getattr(
    -                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
    -                )
    -
    -                self.assertEqual(len(output_hidden_states), expected_num_layers)
    -                self.assertListEqual(
    -                    list(output_hidden_states[0].shape[-2:]),
    -                    [self.model_tester.seq_length, self.model_tester.hidden_size],
    -                )
    -
    -                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
    -                self.assertListEqual(
    -                    list(output_attentions[0].shape[-3:]),
    -                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
    -                )
    -
    -    @slow
    -    def test_mixed_precision(self):
    -        tf.keras.mixed_precision.set_global_policy("mixed_float16")
    -
    -        # try/finally block to ensure subsequent tests run in float32
    -        try:
    -            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -            for model_class in self.all_model_classes[:2]:
    -                class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
    -                model = model_class(config)
    -                outputs = model(class_inputs_dict)
    -
    -                self.assertIsNotNone(outputs)
    -        finally:
    -            tf.keras.mixed_precision.set_global_policy("float32")
    -
    -    @slow
    -    def test_train_pipeline_custom_model(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -        # head_mask and decoder_head_mask have different shapes from the other input args
    -        if "head_mask" in inputs_dict:
    -            del inputs_dict["head_mask"]
    -        if "decoder_head_mask" in inputs_dict:
    -            del inputs_dict["decoder_head_mask"]
    -        if "cross_attn_head_mask" in inputs_dict:
    -            del inputs_dict["cross_attn_head_mask"]
    -        tf_main_layer_classes = {
    -            module_member
    -            for model_class in self.all_model_classes
    -            for module in (import_module(model_class.__module__),)
    -            for module_member_name in dir(module)
    -            if module_member_name.endswith("MainLayer")
    -            for module_member in (getattr(module, module_member_name),)
    -            if isinstance(module_member, type)
    -            and tf.keras.layers.Layer in module_member.__bases__
    -            and getattr(module_member, "_keras_serializable", False)
    -        }
    -
    -        for main_layer_class in tf_main_layer_classes:
    -            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
    -            if "T5" in main_layer_class.__name__:
    -                # Use the same values as in TFT5ModelTester for this shared layer
    -                shared = TFSharedEmbeddings(self.model_tester.vocab_size, self.model_tester.hidden_size, name="shared")
    -                config.use_cache = False
    -                main_layer = main_layer_class(config, embed_tokens=shared)
    -            else:
    -                main_layer = main_layer_class(config)
    -
    -            symbolic_inputs = {
    -                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
    -            }
    -
    -            if hasattr(self.model_tester, "num_labels"):
    -                num_labels = self.model_tester.num_labels
    -            else:
    -                num_labels = 2
    -
    -            X = tf.data.Dataset.from_tensor_slices(
    -                (inputs_dict, np.ones((self.model_tester.batch_size, self.model_tester.seq_length, num_labels, 1)))
    -            ).batch(1)
    -
    -            hidden_states = main_layer(symbolic_inputs)[0]
    -            outputs = tf.keras.layers.Dense(num_labels, activation="softmax", name="outputs")(hidden_states)
    -            model = tf.keras.models.Model(inputs=symbolic_inputs, outputs=[outputs])
    -
    -            model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"])
    -            model.fit(X, epochs=1)
    -
    -            with tempfile.TemporaryDirectory() as tmpdirname:
    -                filepath = os.path.join(tmpdirname, "keras_model.h5")
    -                model.save(filepath)
    -                if "T5" in main_layer_class.__name__:
    -                    model = tf.keras.models.load_model(
    -                        filepath,
    -                        custom_objects={
    -                            main_layer_class.__name__: main_layer_class,
    -                            "TFSharedEmbeddings": TFSharedEmbeddings,
    -                        },
    -                    )
    -                else:
    -                    model = tf.keras.models.load_model(
    -                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
    -                    )
    -                assert isinstance(model, tf.keras.Model)
    -                model(inputs_dict)
    -
    -    @slow
    -    def test_graph_mode_with_inputs_embeds(self):
    -        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    -
    -        for model_class in self.all_model_classes[:2]:
    -            model = model_class(config)
    -
    -            inputs = copy.deepcopy(inputs_dict)
    -
    -            if not self.is_encoder_decoder:
    -                input_ids = inputs["input_ids"]
    -                del inputs["input_ids"]
    -            else:
    -                encoder_input_ids = inputs["input_ids"]
    -                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
    -                del inputs["input_ids"]
    -                inputs.pop("decoder_input_ids", None)
    -
    -            if not self.is_encoder_decoder:
    -                inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids)
    -            else:
    -                inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids)
    -                inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids)
    -
    -            inputs = self._prepare_for_class(inputs, model_class)
    -
    -            @tf.function
    -            def run_in_graph_mode():
    -                return model(inputs)
    -
    -            outputs = run_in_graph_mode()
    -            self.assertIsNotNone(outputs)
    -
    -    def _generate_random_bad_tokens(self, num_bad_tokens, model):
    -        # special tokens cannot be bad tokens
    -        special_tokens = []
    -        if model.config.bos_token_id is not None:
    -            special_tokens.append(model.config.bos_token_id)
    -        if model.config.pad_token_id is not None:
    -            special_tokens.append(model.config.pad_token_id)
    -        if model.config.eos_token_id is not None:
    -            special_tokens.append(model.config.eos_token_id)
    -
    -        # create random bad tokens that are not special tokens
    -        bad_tokens = []
    -        while len(bad_tokens) < num_bad_tokens:
    -            token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0]
    -            if token not in special_tokens:
    -                bad_tokens.append(token)
    -        return bad_tokens
    -
    -    def _check_generated_ids(self, output_ids):
    -        for token_id in output_ids[0].numpy().tolist():
    -            self.assertGreaterEqual(token_id, 0)
    -            self.assertLess(token_id, self.model_tester.vocab_size)
    -
    -    def _check_match_tokens(self, generated_ids, bad_words_ids):
    -        # for all bad word tokens
    -        for bad_word_ids in bad_words_ids:
    -            # for all slices in batch
    -            for generated_ids_slice in generated_ids:
    -                # for all word idx
    -                for i in range(len(bad_word_ids), len(generated_ids_slice)):
    -                    # if tokens match
    -                    if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids:
    -                        return True
    -        return False
    diff --git a/tests/transformers/tests/utils/test_offline.py b/tests/transformers/tests/utils/test_offline.py
    deleted file mode 100644
    index ecc7938bf3..0000000000
    --- a/tests/transformers/tests/utils/test_offline.py
    +++ /dev/null
    @@ -1,206 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import subprocess
    -import sys
    -
    -from transformers import BertConfig, BertModel, BertTokenizer, pipeline
    -from transformers.testing_utils import TestCasePlus, require_torch
    -
    -
    -class OfflineTests(TestCasePlus):
    -    @require_torch
    -    def test_offline_mode(self):
    -        # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before
    -        # `transformers` is loaded, which is too late from inside pytest - so we change it
    -        # while running an external program
    -
    -        # python one-liner segments
    -
    -        # this must be loaded before socket.socket is monkey-patched
    -        load = """
    -from transformers import BertConfig, BertModel, BertTokenizer, pipeline
    -        """
    -
    -        run = """
    -mname = "hf-internal-testing/tiny-random-bert"
    -BertConfig.from_pretrained(mname)
    -BertModel.from_pretrained(mname)
    -BertTokenizer.from_pretrained(mname)
    -pipe = pipeline(task="fill-mask", model=mname)
    -print("success")
    -        """
    -
    -        mock = """
    -import socket
    -def offline_socket(*args, **kwargs): raise RuntimeError("Offline mode is enabled, we shouldn't access internet")
    -socket.socket = offline_socket
    -        """
    -
    -        # Force fetching the files so that we can use the cache
    -        mname = "hf-internal-testing/tiny-random-bert"
    -        BertConfig.from_pretrained(mname)
    -        BertModel.from_pretrained(mname)
    -        BertTokenizer.from_pretrained(mname)
    -        pipeline(task="fill-mask", model=mname)
    -
    -        # baseline - just load from_pretrained with normal network
    -        cmd = [sys.executable, "-c", "\n".join([load, run, mock])]
    -
    -        # should succeed
    -        env = self.get_env()
    -        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
    -        env["TRANSFORMERS_OFFLINE"] = "1"
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    -
    -    @require_torch
    -    def test_offline_mode_no_internet(self):
    -        # python one-liner segments
    -        # this must be loaded before socket.socket is monkey-patched
    -        load = """
    -from transformers import BertConfig, BertModel, BertTokenizer, pipeline
    -        """
    -
    -        run = """
    -mname = "hf-internal-testing/tiny-random-bert"
    -BertConfig.from_pretrained(mname)
    -BertModel.from_pretrained(mname)
    -BertTokenizer.from_pretrained(mname)
    -pipe = pipeline(task="fill-mask", model=mname)
    -print("success")
    -        """
    -
    -        mock = """
    -import socket
    -def offline_socket(*args, **kwargs): raise socket.error("Faking flaky internet")
    -socket.socket = offline_socket
    -        """
    -
    -        # Force fetching the files so that we can use the cache
    -        mname = "hf-internal-testing/tiny-random-bert"
    -        BertConfig.from_pretrained(mname)
    -        BertModel.from_pretrained(mname)
    -        BertTokenizer.from_pretrained(mname)
    -        pipeline(task="fill-mask", model=mname)
    -
    -        # baseline - just load from_pretrained with normal network
    -        cmd = [sys.executable, "-c", "\n".join([load, run, mock])]
    -
    -        # should succeed
    -        env = self.get_env()
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    -
    -    @require_torch
    -    def test_offline_mode_sharded_checkpoint(self):
    -        # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before
    -        # `transformers` is loaded, which is too late from inside pytest - so we change it
    -        # while running an external program
    -
    -        # python one-liner segments
    -
    -        # this must be loaded before socket.socket is monkey-patched
    -        load = """
    -from transformers import BertConfig, BertModel, BertTokenizer
    -        """
    -
    -        run = """
    -mname = "hf-internal-testing/tiny-random-bert-sharded"
    -BertConfig.from_pretrained(mname)
    -BertModel.from_pretrained(mname)
    -print("success")
    -        """
    -
    -        mock = """
    -import socket
    -def offline_socket(*args, **kwargs): raise ValueError("Offline mode is enabled")
    -socket.socket = offline_socket
    -        """
    -
    -        # baseline - just load from_pretrained with normal network
    -        cmd = [sys.executable, "-c", "\n".join([load, run])]
    -
    -        # should succeed
    -        env = self.get_env()
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    -
    -        # next emulate no network
    -        cmd = [sys.executable, "-c", "\n".join([load, mock, run])]
    -
    -        # Doesn't fail anymore since the model is in the cache due to other tests, so commenting this.
    -        # env["TRANSFORMERS_OFFLINE"] = "0"
    -        # result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        # self.assertEqual(result.returncode, 1, result.stderr)
    -
    -        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
    -        env["TRANSFORMERS_OFFLINE"] = "1"
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    -
    -    @require_torch
    -    def test_offline_mode_pipeline_exception(self):
    -        load = """
    -from transformers import pipeline
    -        """
    -        run = """
    -mname = "hf-internal-testing/tiny-random-bert"
    -pipe = pipeline(model=mname)
    -        """
    -
    -        mock = """
    -import socket
    -def offline_socket(*args, **kwargs): raise socket.error("Offline mode is enabled")
    -socket.socket = offline_socket
    -        """
    -        env = self.get_env()
    -        env["TRANSFORMERS_OFFLINE"] = "1"
    -        cmd = [sys.executable, "-c", "\n".join([load, mock, run])]
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 1, result.stderr)
    -        self.assertIn(
    -            "You cannot infer task automatically within `pipeline` when using offline mode",
    -            result.stderr.decode().replace("\n", ""),
    -        )
    -
    -    @require_torch
    -    def test_offline_model_dynamic_model(self):
    -        load = """
    -from transformers import AutoModel
    -        """
    -        run = """
    -mname = "hf-internal-testing/test_dynamic_model"
    -AutoModel.from_pretrained(mname, trust_remote_code=True)
    -print("success")
    -        """
    -
    -        # baseline - just load from_pretrained with normal network
    -        cmd = [sys.executable, "-c", "\n".join([load, run])]
    -
    -        # should succeed
    -        env = self.get_env()
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    -
    -        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
    -        env["TRANSFORMERS_OFFLINE"] = "1"
    -        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
    -        self.assertEqual(result.returncode, 0, result.stderr)
    -        self.assertIn("success", result.stdout.decode())
    diff --git a/tests/transformers/tests/utils/test_skip_decorators.py b/tests/transformers/tests/utils/test_skip_decorators.py
    deleted file mode 100644
    index 94a870e656..0000000000
    --- a/tests/transformers/tests/utils/test_skip_decorators.py
    +++ /dev/null
    @@ -1,119 +0,0 @@
    -# coding=utf-8
    -# Copyright 2019-present, the HuggingFace Inc. team.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -#
    -#
    -#
    -# this test validates that we can stack skip decorators in groups and whether
    -# they work correctly with other decorators
    -#
    -# since the decorators have already built their decision params (e.g. from checking
    -# env vars at import time), we can't mock the env here to test every combination; ideally
    -# the following 4 commands should all be run. But since we have different CI jobs running
    -# different configs, all combinations should get covered
    -#
    -# RUN_SLOW=1 pytest -rA tests/test_skip_decorators.py
    -# RUN_SLOW=1 CUDA_VISIBLE_DEVICES="" pytest -rA tests/test_skip_decorators.py
    -# RUN_SLOW=0 pytest -rA tests/test_skip_decorators.py
    -# RUN_SLOW=0 CUDA_VISIBLE_DEVICES="" pytest -rA tests/test_skip_decorators.py
    -
    -import os
    -import unittest
    -
    -import pytest
    -from parameterized import parameterized
    -from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
    -
    -
    -# skipping in unittest tests
    -
    -params = [(1,)]
    -
    -
    -# test that we can stack our skip decorators with 3rd party decorators
    -def check_slow():
    -    run_slow = bool(os.getenv("RUN_SLOW", 0))
    -    if run_slow:
    -        assert True
    -    else:
    -        assert False, "should have been skipped"
    -
    -
    -# test that we can stack our skip decorators
    -def check_slow_torch_cuda():
    -    run_slow = bool(os.getenv("RUN_SLOW", 0))
    -    if run_slow and torch_device == "cuda":
    -        assert True
    -    else:
    -        assert False, "should have been skipped"
    -
    -
    -@require_torch
    -class SkipTester(unittest.TestCase):
    -    @slow
    -    @require_torch_gpu
    -    def test_2_skips_slow_first(self):
    -        check_slow_torch_cuda()
    -
    -    @require_torch_gpu
    -    @slow
    -    def test_2_skips_slow_last(self):
    -        check_slow_torch_cuda()
    -
    -    # The combination of any skip decorator followed by `parameterized` fails to skip the tests:
    -    # 1. @slow manages to correctly skip `test_param_slow_first`
    -    # 2. but then `parameterized` creates new tests, with a unique name for each parameter group.
    -    #    It has no idea that they are to be skipped, so they all run, ignoring @slow
    -    # Therefore skip decorators must come after `parameterized`
    -    #
    -    # @slow
    -    # @parameterized.expand(params)
    -    # def test_param_slow_first(self, param=None):
    -    #     check_slow()
    -
    -    # This works as expected:
    -    # 1. `parameterized` creates new tests with unique names
    -    # 2. each of them gets an opportunity to be skipped
    -    @parameterized.expand(params)
    -    @slow
    -    def test_param_slow_last(self, param=None):
    -        check_slow()
    -
    -
    -# skipping in non-unittest tests
    -# no problem at all here
    -
    -
    -@slow
    -@require_torch_gpu
    -def test_pytest_2_skips_slow_first():
    -    check_slow_torch_cuda()
    -
    -
    -@require_torch_gpu
    -@slow
    -def test_pytest_2_skips_slow_last():
    -    check_slow_torch_cuda()
    -
    -
    -@slow
    -@pytest.mark.parametrize("param", [1])
    -def test_pytest_param_slow_first(param):
    -    check_slow()
    -
    -
    -@pytest.mark.parametrize("param", [1])
    -@slow
    -def test_pytest_param_slow_last(param):
    -    check_slow()
    diff --git a/tests/transformers/tests/utils/test_versions_utils.py b/tests/transformers/tests/utils/test_versions_utils.py
    deleted file mode 100644
    index 14839400c2..0000000000
    --- a/tests/transformers/tests/utils/test_versions_utils.py
    +++ /dev/null
    @@ -1,97 +0,0 @@
    -# Copyright 2020 The HuggingFace Team. All rights reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -import importlib.metadata
    -import sys
    -
    -from transformers.testing_utils import TestCasePlus
    -from transformers.utils.versions import require_version, require_version_core
    -
    -
    -numpy_ver = importlib.metadata.version("numpy")
    -python_ver = ".".join([str(x) for x in sys.version_info[:3]])
    -
    -
    -class DependencyVersionCheckTest(TestCasePlus):
    -    def test_core(self):
    -        # lt + different version strings
    -        require_version_core("numpy<1000.4.5")
    -        require_version_core("numpy<1000.4")
    -        require_version_core("numpy<1000")
    -
    -        # le
    -        require_version_core("numpy<=1000.4.5")
    -        require_version_core(f"numpy<={numpy_ver}")
    -
    -        # eq
    -        require_version_core(f"numpy=={numpy_ver}")
    -
    -        # ne
    -        require_version_core("numpy!=1000.4.5")
    -
    -        # ge
    -        require_version_core("numpy>=1.0")
    -        require_version_core("numpy>=1.0.0")
    -        require_version_core(f"numpy>={numpy_ver}")
    -
    -        # gt
    -        require_version_core("numpy>1.0.0")
    -
    -        # mix
    -        require_version_core("numpy>1.0.0,<1000")
    -
    -        # requirement w/o version
    -        require_version_core("numpy")
    -
    -        # unmet requirements due to version conflict
    -        for req in ["numpy==1.0.0", "numpy>=1000.0.0", f"numpy<{numpy_ver}"]:
    -            try:
    -                require_version_core(req)
    -            except ImportError as e:
    -                self.assertIn(f"{req} is required", str(e))
    -                self.assertIn("but found", str(e))
    -
    -        # unmet requirements due to missing module
    -        for req in ["numpipypie>1", "numpipypie2"]:
    -            try:
    -                require_version_core(req)
    -            except importlib.metadata.PackageNotFoundError as e:
    -                self.assertIn(f"The '{req}' distribution was not found and is required by this application", str(e))
    -                self.assertIn("Try: pip install transformers -U", str(e))
    -
    -        # bogus requirements formats:
    -        # 1. whole thing
    -        for req in ["numpy??1.0.0", "numpy1.0.0"]:
    -            try:
    -                require_version_core(req)
    -            except ValueError as e:
    -                self.assertIn("requirement needs to be in the pip package format", str(e))
    -        # 2. only operators
    -        for req in ["numpy=1.0.0", "numpy == 1.00", "numpy<>1.0.0", "numpy><1.00", "numpy>>1.0.0"]:
    -            try:
    -                require_version_core(req)
    -            except ValueError as e:
    -                self.assertIn("need one of ", str(e))
    -
    -    def test_python(self):
    -        # matching requirement
    -        require_version("python>=3.6.0")
    -
    -        # not matching requirements
    -        for req in ["python>9.9.9", "python<3.0.0"]:
    -            try:
    -                require_version_core(req)
    -            except ImportError as e:
    -                self.assertIn(f"{req} is required", str(e))
    -                self.assertIn(f"but found python=={python_ver}", str(e))
    diff --git a/tests/transformers/tests/utils/tiny_model_summary.json b/tests/transformers/tests/utils/tiny_model_summary.json
    deleted file mode 100644
    index b7fdf87bac..0000000000
    --- a/tests/transformers/tests/utils/tiny_model_summary.json
    +++ /dev/null
    @@ -1,6924 +0,0 @@
    -{
    -    "ASTForAudioClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ASTFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "ASTForAudioClassification"
    -        ],
    -        "sha": "83d6e076db7768a3645401bad3204624985e1d08"
    -    },
    -    "ASTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ASTFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "ASTModel"
    -        ],
    -        "sha": "75e68f956f6f2c0709b01e596e7a6aecb1b29dce"
    -    },
    -    "AlbertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForMaskedLM",
    -            "TFAlbertForMaskedLM"
    -        ],
    -        "sha": "d29de71ac29e1019c3a7762f7357f750730cb037"
    -    },
    -    "AlbertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForMultipleChoice",
    -            "TFAlbertForMultipleChoice"
    -        ],
    -        "sha": "242aecce6a589a2964c0f695621fa22a83751579"
    -    },
    -    "AlbertForPreTraining": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForPreTraining",
    -            "TFAlbertForPreTraining"
    -        ],
    -        "sha": "41330be4b271687f4d88ddc96346c12aa11de983"
    -    },
    -    "AlbertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForQuestionAnswering",
    -            "TFAlbertForQuestionAnswering"
    -        ],
    -        "sha": "040b81c15f437f4722349dc5b41fccd17ebd7fdc"
    -    },
    -    "AlbertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForSequenceClassification",
    -            "TFAlbertForSequenceClassification"
    -        ],
    -        "sha": "39c1a0e2c1c2623106d3211d751e9b32f23a91a0"
    -    },
    -    "AlbertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertForTokenClassification",
    -            "TFAlbertForTokenClassification"
    -        ],
    -        "sha": "359c3f4a311a4053a6f6d6a880db5f82c8e3ff1f"
    -    },
    -    "AlbertModel": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "AlbertModel",
    -            "TFAlbertModel"
    -        ],
    -        "sha": "34a63314686b64aaeb595ddb95006f1ff2ffda17"
    -    },
    -    "AlignModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "EfficientNetImageProcessor"
    -        ],
    -        "model_classes": [
    -            "AlignModel"
    -        ],
    -        "sha": "68a4f9d3f493f44efa7c1dde6fcca23350e2c92b"
    -    },
    -    "AltCLIPModel": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "CLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "AltCLIPModel"
    -        ],
    -        "sha": "3106af0fd503970717c05f27218e5cacf19ba872"
    -    },
    -    "BartForCausalLM": {
    -        "tokenizer_classes": [
    -            "BartTokenizer",
    -            "BartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BartForCausalLM"
    -        ],
    -        "sha": "c25526ac67d2dbe79fe5462af4b7908ca2fbc3ff"
    -    },
    -    "BartForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "BartTokenizer",
    -            "BartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BartForConditionalGeneration",
    -            "TFBartForConditionalGeneration"
    -        ],
    -        "sha": "3a489a21e4b04705f4a6047924b7616a67be7e37"
    -    },
    -    "BartForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BartTokenizer",
    -            "BartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BartForQuestionAnswering"
    -        ],
    -        "sha": "3ebf9aab39a57ceab55128d5fc6f61e4db0dadd4"
    -    },
    -    "BartForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BartTokenizer",
    -            "BartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BartForSequenceClassification",
    -            "TFBartForSequenceClassification"
    -        ],
    -        "sha": "ea452fd9a928cfebd71723afa50feb20326917bc"
    -    },
    -    "BartModel": {
    -        "tokenizer_classes": [
    -            "BartTokenizer",
    -            "BartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BartModel",
    -            "TFBartModel"
    -        ],
    -        "sha": "e5df6d1aa75f03833b2df328b9c35463f73a421b"
    -    },
    -    "BeitForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BeitForImageClassification"
    -        ],
    -        "sha": "e997587bb890f82faad4bd25eb23d85ba21ecaaa"
    -    },
    -    "BeitForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BeitForSemanticSegmentation"
    -        ],
    -        "sha": "d4afa9e21e3fe5b087578ed68974d9b3ffc1fb22"
    -    },
    -    "BeitModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BeitModel"
    -        ],
    -        "sha": "5c4a051f0cca6f64d02c6168deb88413cae10d2c"
    -    },
    -    "BertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForMaskedLM",
    -            "TFBertForMaskedLM"
    -        ],
    -        "sha": "3e32baa52ce044c75edfb5c28abd51ee8d051282"
    -    },
    -    "BertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForMultipleChoice",
    -            "TFBertForMultipleChoice"
    -        ],
    -        "sha": "0b8c3a6d411d1e19e5fd98d4d8631ae7616eeeaa"
    -    },
    -    "BertForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForNextSentencePrediction",
    -            "TFBertForNextSentencePrediction"
    -        ],
    -        "sha": "628e70debf8864bd0b63aff7901d17d9c4f7612c"
    -    },
    -    "BertForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForPreTraining",
    -            "TFBertForPreTraining"
    -        ],
    -        "sha": "c748ad37e6a200a6f64b2764191bfe13f976032f"
    -    },
    -    "BertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForQuestionAnswering",
    -            "TFBertForQuestionAnswering"
    -        ],
    -        "sha": "4671ad0c21493b97c5eb2f0201192704c29876d5"
    -    },
    -    "BertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForSequenceClassification",
    -            "TFBertForSequenceClassification"
    -        ],
    -        "sha": "37a9d44022264c12bdf3ec257778f953b63d4aaf"
    -    },
    -    "BertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertForTokenClassification",
    -            "TFBertForTokenClassification"
    -        ],
    -        "sha": "d7dc3a0793ff6dfcb794b21130ee0f185d2c61a2"
    -    },
    -    "BertLMHeadModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertLMHeadModel",
    -            "TFBertLMHeadModel"
    -        ],
    -        "sha": "b4e3acc1990f3e365ffddbd54b620a26d9fb4b09"
    -    },
    -    "BertModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BertModel",
    -            "TFBertModel"
    -        ],
    -        "sha": "3956d303d3cddf0708ff20660c1ea5f6ec30e434"
    -    },
    -    "BigBirdForCausalLM": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForCausalLM"
    -        ],
    -        "sha": "5c7a487af5248d9c01b45d5481b7d7bb9b36e1b5"
    -    },
    -    "BigBirdForMaskedLM": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForMaskedLM"
    -        ],
    -        "sha": "476ef8225c0f69270b577706ad4f1dda13e4dde5"
    -    },
    -    "BigBirdForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForMultipleChoice"
    -        ],
    -        "sha": "cf93eaa1019987112c171a407745bc183a20513a"
    -    },
    -    "BigBirdForPreTraining": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForPreTraining"
    -        ],
    -        "sha": "5fb9efa13334431e7c186a9fa314b89c4a1eee72"
    -    },
    -    "BigBirdForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForQuestionAnswering"
    -        ],
    -        "sha": "f82f88bd71fba819a8ffb0692915d3529e705417"
    -    },
    -    "BigBirdForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForSequenceClassification"
    -        ],
    -        "sha": "ea398090858f9af93b54fc9a8d65cfed78ac27ff"
    -    },
    -    "BigBirdForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdForTokenClassification"
    -        ],
    -        "sha": "2cdea118999fa58ba9fb0162d99e2ffa146c3df1"
    -    },
    -    "BigBirdModel": {
    -        "tokenizer_classes": [
    -            "BigBirdTokenizer",
    -            "BigBirdTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdModel"
    -        ],
    -        "sha": "9c55989f31df156194e6997606fb14d9897e0300"
    -    },
    -    "BigBirdPegasusForCausalLM": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdPegasusForCausalLM"
    -        ],
    -        "sha": "49bc8816c666dee32e27cd8e00136b604eb85243"
    -    },
    -    "BigBirdPegasusForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdPegasusForConditionalGeneration"
    -        ],
    -        "sha": "e791aa6d1af5a76ca0926d95b1f28bd2d8adf376"
    -    },
    -    "BigBirdPegasusForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdPegasusForQuestionAnswering"
    -        ],
    -        "sha": "7650e076713ca707a37062adc8c9c1cd60dad7c7"
    -    },
    -    "BigBirdPegasusForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdPegasusForSequenceClassification"
    -        ],
    -        "sha": "02500e8ebd9c53528750013fb963fbdc2be34034"
    -    },
    -    "BigBirdPegasusModel": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BigBirdPegasusModel"
    -        ],
    -        "sha": "b07c5304dfba673cf8b9cf5cd1aa45fbfea1c2f3"
    -    },
    -    "BioGptForCausalLM": {
    -        "tokenizer_classes": [
    -            "BioGptTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BioGptForCausalLM"
    -        ],
    -        "sha": "07073b31da84054fd12226e3cae4cb3beb2547f9"
    -    },
    -    "BioGptForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BioGptTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BioGptForSequenceClassification"
    -        ],
    -        "sha": "8e18ad6218abd795e050dec324a8c827ccedacb4"
    -    },
    -    "BioGptForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BioGptTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BioGptForTokenClassification"
    -        ],
    -        "sha": "67f8173c1a17273064d452a9031a51b67f327b6a"
    -    },
    -    "BioGptModel": {
    -        "tokenizer_classes": [
    -            "BioGptTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BioGptModel"
    -        ],
    -        "sha": "fe18551d0743538a990520b75707294ec57b4ebe"
    -    },
    -    "BitBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BitBackbone"
    -        ],
    -        "sha": "2f06f6b4395b6dce2b00ac839ff757410e743cd7"
    -    },
    -    "BitForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BitForImageClassification"
    -        ],
    -        "sha": "d0d8476f2d285ddda7c42c0d4a8e4bf6f5d2bfdf"
    -    },
    -    "BitModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BitModel"
    -        ],
    -        "sha": "30a8a9b1a6b253cc500c01cf41bc1fc9581ea5e5"
    -    },
    -    "BlenderbotForCausalLM": {
    -        "tokenizer_classes": [
    -            "BlenderbotTokenizer",
    -            "BlenderbotTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotForCausalLM"
    -        ],
    -        "sha": "8aad2e13e8920bca3cf988ba45f8a7b008b51a81"
    -    },
    -    "BlenderbotForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "BlenderbotTokenizer",
    -            "BlenderbotTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotForConditionalGeneration",
    -            "TFBlenderbotForConditionalGeneration"
    -        ],
    -        "sha": "e8532878b9924fa02fb4b059b7f6e7fa372fff91"
    -    },
    -    "BlenderbotModel": {
    -        "tokenizer_classes": [
    -            "BlenderbotTokenizer",
    -            "BlenderbotTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotModel",
    -            "TFBlenderbotModel"
    -        ],
    -        "sha": "ff848a40c30ca98eb7c6870bbb02677d5af9db55"
    -    },
    -    "BlenderbotSmallForCausalLM": {
    -        "tokenizer_classes": [
    -            "BlenderbotSmallTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotSmallForCausalLM"
    -        ],
    -        "sha": "4c57c106630932eb9de4d76210a540d04616304d"
    -    },
    -    "BlenderbotSmallForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "BlenderbotSmallTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotSmallForConditionalGeneration",
    -            "TFBlenderbotSmallForConditionalGeneration"
    -        ],
    -        "sha": "b8db01fcf3e37a5b369cd50e169bf383b8e905d8"
    -    },
    -    "BlenderbotSmallModel": {
    -        "tokenizer_classes": [
    -            "BlenderbotSmallTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BlenderbotSmallModel",
    -            "TFBlenderbotSmallModel"
    -        ],
    -        "sha": "0a10c70e225ec63278faffa8fabf759f063f0e55"
    -    },
    -    "Blip2ForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "BlipImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Blip2ForConditionalGeneration"
    -        ],
    -        "sha": "35e1ef43da3554af62eb29a7b3dbbef3f3bef48e"
    -    },
    -    "Blip2Model": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "BlipImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Blip2Model"
    -        ],
    -        "sha": "c23378f225be31872fff33c103cf0ebc2454ffcc"
    -    },
    -    "BlipForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "BlipImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BlipForConditionalGeneration",
    -            "TFBlipForConditionalGeneration"
    -        ],
    -        "sha": "eaf32bc0369349deef0c777442fc185119171d1f"
    -    },
    -    "BlipModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "BlipImageProcessor"
    -        ],
    -        "model_classes": [
    -            "BlipModel",
    -            "TFBlipModel"
    -        ],
    -        "sha": "3d1d1c15eff22d6b2664a2d15757fa6f5d93827d"
    -    },
    -    "BloomForCausalLM": {
    -        "tokenizer_classes": [
    -            "BloomTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BloomForCausalLM"
    -        ],
    -        "sha": "0f4f06f162cd67d34d03ee156484e4001d468500"
    -    },
    -    "BloomForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BloomTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BloomForQuestionAnswering"
    -        ],
    -        "sha": "23f369f163eef8c9c9685900440b0cbb0f3439fd"
    -    },
    -    "BloomForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BloomTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BloomForSequenceClassification"
    -        ],
    -        "sha": "b2280eef7172835f39b265eb0c46623257f67bbe"
    -    },
    -    "BloomForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BloomTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BloomForTokenClassification"
    -        ],
    -        "sha": "9796aa45f99adff987c978089e11c0bd9d7b997f"
    -    },
    -    "BloomModel": {
    -        "tokenizer_classes": [
    -            "BloomTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "BloomModel"
    -        ],
    -        "sha": "28b600fcfdc4f4938406fb518abf895620048cb2"
    -    },
    -    "CLIPModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "CLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "CLIPModel",
    -            "TFCLIPModel"
    -        ],
    -        "sha": "0452d344074485d0e7eb5d5c12447b7c9dbc9619"
    -    },
    -    "CLIPSegModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "CLIPSegModel"
    -        ],
    -        "sha": "7b1305214ccc85d29b776ffbee06748693852a04"
    -    },
    -    "CTRLForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "CTRLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CTRLForSequenceClassification",
    -            "TFCTRLForSequenceClassification"
    -        ],
    -        "sha": "280b5a3502d607c55c9f8d9f198fe9c2802d6f73"
    -    },
    -    "CTRLLMHeadModel": {
    -        "tokenizer_classes": [
    -            "CTRLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CTRLLMHeadModel",
    -            "TFCTRLLMHeadModel"
    -        ],
    -        "sha": "662381663b216f1dd3c9cd30e2e83cb4c6fc9552"
    -    },
    -    "CTRLModel": {
    -        "tokenizer_classes": [
    -            "CTRLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CTRLModel",
    -            "TFCTRLModel"
    -        ],
    -        "sha": "68b19b4f132d5a191a73acd78d983cbdcf068e9c"
    -    },
    -    "CanineForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "CanineTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CanineForMultipleChoice"
    -        ],
    -        "sha": "fa0451453ed202f903ff7dcf6071aab6630fb89f"
    -    },
    -    "CanineForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "CanineTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CanineForQuestionAnswering"
    -        ],
    -        "sha": "5e1012bb086ac2e0b1497eeb7ed14eb2183d4ecb"
    -    },
    -    "CanineForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "CanineTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CanineForSequenceClassification"
    -        ],
    -        "sha": "75336dc9179153869c38a8047ce4b1e02677a260"
    -    },
    -    "CanineForTokenClassification": {
    -        "tokenizer_classes": [
    -            "CanineTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CanineForTokenClassification"
    -        ],
    -        "sha": "65a622ea8e12597e12f45e59d46d8dbe8461fc10"
    -    },
    -    "CanineModel": {
    -        "tokenizer_classes": [
    -            "CanineTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CanineModel"
    -        ],
    -        "sha": "531ef67ad4f0b3dc7a9e5d722c774096b7401b1b"
    -    },
    -    "ChineseCLIPModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ChineseCLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ChineseCLIPModel"
    -        ],
    -        "sha": "504271a3c5fd9c2e877f5b4c01848bc18778c7c3"
    -    },
    -    "ClapModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ClapFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "ClapModel"
    -        ],
    -        "sha": "a7874595b900f9b2ddc79130dafc3ff48f4fbfb9"
    -    },
    -    "CodeGenForCausalLM": {
    -        "tokenizer_classes": [
    -            "CodeGenTokenizer",
    -            "CodeGenTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CodeGenForCausalLM"
    -        ],
    -        "sha": "a3fc69d757fd1f0aa01bcbc4337f586651c7cb10"
    -    },
    -    "CodeGenModel": {
    -        "tokenizer_classes": [
    -            "CodeGenTokenizer",
    -            "CodeGenTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "CodeGenModel"
    -        ],
    -        "sha": "dad4941a2b7429fc6e8206fcc4a04fc40f4a0beb"
    -    },
    -    "ConditionalDetrForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConditionalDetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConditionalDetrForObjectDetection"
    -        ],
    -        "sha": "762c213a0285edc84eb813a2ed90063cf971ca43"
    -    },
    -    "ConditionalDetrModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConditionalDetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConditionalDetrModel"
    -        ],
    -        "sha": "18b75874158cac520c63605293b06e0b1327c263"
    -    },
    -    "ConvBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertForMaskedLM",
    -            "TFConvBertForMaskedLM"
    -        ],
    -        "sha": "307c70e32c3d3c18aeb45e0cbdc9fcd2957d9aba"
    -    },
    -    "ConvBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertForMultipleChoice",
    -            "TFConvBertForMultipleChoice"
    -        ],
    -        "sha": "d6561a21ffdb82d03c1822af0510eb7482ce5026"
    -    },
    -    "ConvBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertForQuestionAnswering",
    -            "TFConvBertForQuestionAnswering"
    -        ],
    -        "sha": "8a056da5cc421415c2a24b9f644dd95ca279411d"
    -    },
    -    "ConvBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertForSequenceClassification",
    -            "TFConvBertForSequenceClassification"
    -        ],
    -        "sha": "8bb8b20e51d282d777cc567cacadd97a35f0811e"
    -    },
    -    "ConvBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertForTokenClassification",
    -            "TFConvBertForTokenClassification"
    -        ],
    -        "sha": "8db0dd3c2b8ccc958fa9a84801f4f837b42fcf2c"
    -    },
    -    "ConvBertModel": {
    -        "tokenizer_classes": [
    -            "ConvBertTokenizer",
    -            "ConvBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ConvBertModel",
    -            "TFConvBertModel"
    -        ],
    -        "sha": "c9c5b1a74f0e468d8467473cabeaa67fcdbaddb7"
    -    },
    -    "ConvNextBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextBackbone"
    -        ],
    -        "sha": "499c7d6a97825b79e19663b70f3b60c4813b6bf2"
    -    },
    -    "ConvNextForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextForImageClassification",
    -            "TFConvNextForImageClassification"
    -        ],
    -        "sha": "0b490fd6b19cdbf721025dbd6ee45dcc5828e6e3"
    -    },
    -    "ConvNextModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextModel",
    -            "TFConvNextModel"
    -        ],
    -        "sha": "7b3b47a57b9a9120e022b91d6067daeac55b794f"
    -    },
    -    "ConvNextV2Backbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextV2Backbone"
    -        ],
    -        "sha": "c82fc526949dfd892a1fee3c34be6f8d80c4d3df"
    -    },
    -    "ConvNextV2ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextV2ForImageClassification"
    -        ],
    -        "sha": "ee22bae1cbb87d66fc7f62f7e15a43d6ff80d3cc"
    -    },
    -    "ConvNextV2Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ConvNextV2Model"
    -        ],
    -        "sha": "c4dd68ee1102cba05bcc483da2a88e39427b7249"
    -    },
    -    "CvtForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "CvtForImageClassification",
    -            "TFCvtForImageClassification"
    -        ],
    -        "sha": "4b1938e252fdb26a06c1f5755e07fa8f6eed2d75"
    -    },
    -    "CvtModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "CvtModel",
    -            "TFCvtModel"
    -        ],
    -        "sha": "27fed12c174f4f4f1fe27075d1c29602fe0669f0"
    -    },
    -    "DPRQuestionEncoder": {
    -        "tokenizer_classes": [
    -            "DPRQuestionEncoderTokenizer",
    -            "DPRQuestionEncoderTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DPRQuestionEncoder",
    -            "TFDPRQuestionEncoder"
    -        ],
    -        "sha": "09ae0269780271e0a4916f7bab1dbc4f8a76070d"
    -    },
    -    "DPTForDepthEstimation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DPTForDepthEstimation"
    -        ],
    -        "sha": "11b7735d64d95b6599811631b012d2dec6eaa2c1"
    -    },
    -    "DPTForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DPTForSemanticSegmentation"
    -        ],
    -        "sha": "e140c3c716a4bf11dad875e5f5f0abd2bd4cbbcb"
    -    },
    -    "DPTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DPTModel"
    -        ],
    -        "sha": "1d6ae6c0b60868dffbef0dddeda381c51c6dcba5"
    -    },
    -    "Data2VecAudioForAudioFrameClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Data2VecAudioForAudioFrameClassification"
    -        ],
    -        "sha": "a64828b27e73fc8dd95aeb315108ca2f6a66b55f"
    -    },
    -    "Data2VecAudioForCTC": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Data2VecAudioForCTC"
    -        ],
    -        "sha": "bb161b6a181bd2c22cf30222f46fa6ef42225744"
    -    },
    -    "Data2VecAudioForSequenceClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Data2VecAudioForSequenceClassification"
    -        ],
    -        "sha": "8de17e0a959eca5f72b2ea59a11bc1fa744785d9"
    -    },
    -    "Data2VecAudioForXVector": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Data2VecAudioForXVector"
    -        ],
    -        "sha": "dcb92484cf28fb4fe1dcf5d6e8d78e04382fdce9"
    -    },
    -    "Data2VecAudioModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Data2VecAudioModel"
    -        ],
    -        "sha": "73f503fdff73b7616154f64dbe38a685cc48e8eb"
    -    },
    -    "Data2VecTextForCausalLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForCausalLM"
    -        ],
    -        "sha": "1f3658ce623653338cd31516551e8181aa08bb38"
    -    },
    -    "Data2VecTextForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForMaskedLM"
    -        ],
    -        "sha": "fb41ac30d0faa0899bf5afaa0986df8993395ca6"
    -    },
    -    "Data2VecTextForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForMultipleChoice"
    -        ],
    -        "sha": "e7556d520ad90ebae5ad88554d45a37488d00040"
    -    },
    -    "Data2VecTextForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForQuestionAnswering"
    -        ],
    -        "sha": "9630833d76a1fd7e96b904d87bb11b7c00ccd021"
    -    },
    -    "Data2VecTextForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForSequenceClassification"
    -        ],
    -        "sha": "156e4019c37d9592f193ba80553cd245cbccecb3"
    -    },
    -    "Data2VecTextForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextForTokenClassification"
    -        ],
    -        "sha": "55b3a49fdbf22479d6eb939261d4b884ea288270"
    -    },
    -    "Data2VecTextModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "Data2VecTextModel"
    -        ],
    -        "sha": "c21be3e4f88e8357bf33bfba8f8e05ae2e735124"
    -    },
    -    "Data2VecVisionForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Data2VecVisionForImageClassification",
    -            "TFData2VecVisionForImageClassification"
    -        ],
    -        "sha": "d640e7ced7a3fbbb8c8661a4f67b934e55406172"
    -    },
    -    "Data2VecVisionForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Data2VecVisionForSemanticSegmentation",
    -            "TFData2VecVisionForSemanticSegmentation"
    -        ],
    -        "sha": "3eba3cd694fab6530b7e5da8f49d3951301c816a"
    -    },
    -    "Data2VecVisionModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BeitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Data2VecVisionModel",
    -            "TFData2VecVisionModel"
    -        ],
    -        "sha": "2a7ad25e4359970dc70494a2f3eb98e2a3c9806d"
    -    },
    -    "DebertaForMaskedLM": {
    -        "tokenizer_classes": [
    -            "DebertaTokenizer",
    -            "DebertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaForMaskedLM",
    -            "TFDebertaForMaskedLM"
    -        ],
    -        "sha": "e0f9ada9e0f6d4d7cc39d7cbd58369b0c84de33d"
    -    },
    -    "DebertaForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "DebertaTokenizer",
    -            "DebertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaForQuestionAnswering",
    -            "TFDebertaForQuestionAnswering"
    -        ],
    -        "sha": "a3eb69cdb0b52f7d0fb730e882f1a54b9a7442ea"
    -    },
    -    "DebertaForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "DebertaTokenizer",
    -            "DebertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaForSequenceClassification",
    -            "TFDebertaForSequenceClassification"
    -        ],
    -        "sha": "32af91d12c4e9b6d62b420bee93311fd77d3c933"
    -    },
    -    "DebertaForTokenClassification": {
    -        "tokenizer_classes": [
    -            "DebertaTokenizer",
    -            "DebertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaForTokenClassification",
    -            "TFDebertaForTokenClassification"
    -        ],
    -        "sha": "ba62ba2726d813e60e512476fc1b178aa3858175"
    -    },
    -    "DebertaModel": {
    -        "tokenizer_classes": [
    -            "DebertaTokenizer",
    -            "DebertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaModel",
    -            "TFDebertaModel"
    -        ],
    -        "sha": "4273294e14cd04c0e2cd1dcff5cf7e5d4fe906ba"
    -    },
    -    "DebertaV2ForMaskedLM": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2ForMaskedLM",
    -            "TFDebertaV2ForMaskedLM"
    -        ],
    -        "sha": "a053dedc2cdf32918a84277cb0c05186604496a5"
    -    },
    -    "DebertaV2ForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2ForMultipleChoice"
    -        ],
    -        "sha": "07e39f520ce239b39ef8cb24cd7874d06c791063"
    -    },
    -    "DebertaV2ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2ForQuestionAnswering",
    -            "TFDebertaV2ForQuestionAnswering"
    -        ],
    -        "sha": "9cecb3a7fc6b95099122283644ea1f8ced287d1b"
    -    },
    -    "DebertaV2ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2ForSequenceClassification",
    -            "TFDebertaV2ForSequenceClassification"
    -        ],
    -        "sha": "df9ea1f5c0f2ccd139b21cfb3963a5a5ebfb5b81"
    -    },
    -    "DebertaV2ForTokenClassification": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2ForTokenClassification",
    -            "TFDebertaV2ForTokenClassification"
    -        ],
    -        "sha": "51fe01989df38a540ac1abca5ee71a51365defd5"
    -    },
    -    "DebertaV2Model": {
    -        "tokenizer_classes": [
    -            "DebertaV2Tokenizer",
    -            "DebertaV2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DebertaV2Model",
    -            "TFDebertaV2Model"
    -        ],
    -        "sha": "211df4bd1a4a9b66c97af3f9231a5d2af8de7b9f"
    -    },
    -    "DeformableDetrForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeformableDetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeformableDetrForObjectDetection"
    -        ],
    -        "sha": "8fa0db215c458f60ae4d455d6fb067c1c5e39fdc"
    -    },
    -    "DeformableDetrModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeformableDetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeformableDetrModel"
    -        ],
    -        "sha": "0faac5624696b03edd14694642f9804f2cd8f3da"
    -    },
    -    "DeiTForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeiTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeiTForImageClassification",
    -            "TFDeiTForImageClassification"
    -        ],
    -        "sha": "21fc864199dafa0130f16a45769c6b6ca22c7784"
    -    },
    -    "DeiTForImageClassificationWithTeacher": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeiTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeiTForImageClassificationWithTeacher",
    -            "TFDeiTForImageClassificationWithTeacher"
    -        ],
    -        "sha": "5a5738a109e27f3d4b78a0db4cb1d3331140c10e"
    -    },
    -    "DeiTForMaskedImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeiTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeiTForMaskedImageModeling",
    -            "TFDeiTForMaskedImageModeling"
    -        ],
    -        "sha": "d5df5c538fe1efb8d668a3893d1691d505a0de06"
    -    },
    -    "DeiTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DeiTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DeiTModel",
    -            "TFDeiTModel"
    -        ],
    -        "sha": "0fdbff6f44b7c6933c2027fec1d7f87bec06b590"
    -    },
    -    "DetaForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetaImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DetaForObjectDetection"
    -        ],
    -        "sha": "a15ad6ce64fbcb5021b2b99e9587c4011ef3341d"
    -    },
    -    "DetaModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetaImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DetaModel"
    -        ],
    -        "sha": "8820f2297ec0dec8f1875054559c8b7a162098e3"
    -    },
    -    "DetrForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DetrForObjectDetection"
    -        ],
    -        "sha": "7dc967c53f4b3f07904c42b255346b744d0ad84e"
    -    },
    -    "DetrForSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DetrForSegmentation"
    -        ],
    -        "sha": "e34330acdae359588ef853e961a78d419dc4e8eb"
    -    },
    -    "DetrModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DetrModel"
    -        ],
    -        "sha": "f15ce38a10c7447e8048b1681e4811322a005722"
    -    },
    -    "DinatBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DinatBackbone"
    -        ],
    -        "sha": "3ba13790a0796d90104c207f75bb3d5d79723d51"
    -    },
    -    "DinatForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DinatForImageClassification"
    -        ],
    -        "sha": "624cf2d864a7ea2f90e24014a213e34597e8bd76"
    -    },
    -    "DinatModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DinatModel"
    -        ],
    -        "sha": "d6c75bc51196f0a683afb12de6310fdda13efefd"
    -    },
    -    "Dinov2ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Dinov2ForImageClassification"
    -        ],
    -        "sha": "ae44840966456aae33641df2c8c8a4af5b457b24"
    -    },
    -    "Dinov2Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Dinov2Model"
    -        ],
    -        "sha": "6f560b1cc9806bcf84fe0b0c60b5faf9c29be959"
    -    },
    -    "DistilBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertForMaskedLM",
    -            "TFDistilBertForMaskedLM"
    -        ],
    -        "sha": "b2dfda30b012821996e6e603729562d9c900bc0f"
    -    },
    -    "DistilBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertForMultipleChoice",
    -            "TFDistilBertForMultipleChoice"
    -        ],
    -        "sha": "ec6b83129a7d1be2a6b8d58303abcca5541a5cb3"
    -    },
    -    "DistilBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertForQuestionAnswering",
    -            "TFDistilBertForQuestionAnswering"
    -        ],
    -        "sha": "812406b226415044469b0e0a84c4fe0ff338c5d3"
    -    },
    -    "DistilBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertForSequenceClassification",
    -            "TFDistilBertForSequenceClassification"
    -        ],
    -        "sha": "6f427ce7b3e5aaa596938fbd98437d3875581b7b"
    -    },
    -    "DistilBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertForTokenClassification",
    -            "TFDistilBertForTokenClassification"
    -        ],
    -        "sha": "166dbe3f5d6ecd871762567069454d6ec65234b4"
    -    },
    -    "DistilBertModel": {
    -        "tokenizer_classes": [
    -            "DistilBertTokenizer",
    -            "DistilBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "DistilBertModel",
    -            "TFDistilBertModel"
    -        ],
    -        "sha": "cc4425ad0676f3ec00e8bffe485fe83cae61041a"
    -    },
    -    "DonutSwinModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DonutImageProcessor"
    -        ],
    -        "model_classes": [
    -            "DonutSwinModel"
    -        ],
    -        "sha": "1b10654fbfe2f2ea410a672ab605bd5c60d3f284"
    -    },
    -    "EfficientFormerForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EfficientFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "EfficientFormerForImageClassification",
    -            "TFEfficientFormerForImageClassification"
    -        ],
    -        "sha": "ebadb628e12f268e321fcc756fa4606f7b5b3178"
    -    },
    -    "EfficientFormerForImageClassificationWithTeacher": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EfficientFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "EfficientFormerForImageClassificationWithTeacher",
    -            "TFEfficientFormerForImageClassificationWithTeacher"
    -        ],
    -        "sha": "1beabce6da9cb4ebbeafcd1ef23fac36b4a269e2"
    -    },
    -    "EfficientFormerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EfficientFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "EfficientFormerModel",
    -            "TFEfficientFormerModel"
    -        ],
    -        "sha": "200fae5b875844d09c8a91d1c155b72b06a517f6"
    -    },
    -    "EfficientNetForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EfficientNetImageProcessor"
    -        ],
    -        "model_classes": [
    -            "EfficientNetForImageClassification"
    -        ],
    -        "sha": "6ed195ee636d2c0b885139da8c7b45d57ebaeee0"
    -    },
    -    "EfficientNetModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EfficientNetImageProcessor"
    -        ],
    -        "model_classes": [
    -            "EfficientNetModel"
    -        ],
    -        "sha": "eb03c90d4aaad98af0f19e0dfbdc41106297ffff"
    -    },
    -    "ElectraForCausalLM": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForCausalLM"
    -        ],
    -        "sha": "c78396bc8cdd8db247892339de8da80d691d1d04"
    -    },
    -    "ElectraForMaskedLM": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForMaskedLM",
    -            "TFElectraForMaskedLM"
    -        ],
    -        "sha": "631337703dbd8d41904c39891a41c6f1edd31813"
    -    },
    -    "ElectraForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForMultipleChoice",
    -            "TFElectraForMultipleChoice"
    -        ],
    -        "sha": "66fdea6e22cfcbd3caa49ea82f31871c460612fa"
    -    },
    -    "ElectraForPreTraining": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForPreTraining",
    -            "TFElectraForPreTraining"
    -        ],
    -        "sha": "7b2d0fa8726b1180c7d6cde4f4afc3800eba7e6f"
    -    },
    -    "ElectraForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForQuestionAnswering",
    -            "TFElectraForQuestionAnswering"
    -        ],
    -        "sha": "c6b127fd9f3019462e4ca2373762836207e39ce2"
    -    },
    -    "ElectraForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForSequenceClassification",
    -            "TFElectraForSequenceClassification"
    -        ],
    -        "sha": "41f0089ab7876abe0e28dbbd565144acb31f8127"
    -    },
    -    "ElectraForTokenClassification": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraForTokenClassification",
    -            "TFElectraForTokenClassification"
    -        ],
    -        "sha": "1fdbbe70c1ddd16503820a1443d6a379a15ed777"
    -    },
    -    "ElectraModel": {
    -        "tokenizer_classes": [
    -            "ElectraTokenizer",
    -            "ElectraTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ElectraModel",
    -            "TFElectraModel"
    -        ],
    -        "sha": "312b532cbef26610d80f2bd008650160cae4f7a1"
    -    },
    -    "EncodecModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "EncodecFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "EncodecModel"
    -        ],
    -        "sha": "e14c5a2fd6529c85cd4ac5a05ee9e550ced6a006"
    -    },
    -    "EncoderDecoderModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "EncoderDecoderModel",
    -            "TFEncoderDecoderModel"
    -        ],
    -        "sha": "1038be9fd1b87b2e0a8f33721ff8e4612d34b3b6"
    -    },
    -    "ErnieForCausalLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForCausalLM"
    -        ],
    -        "sha": "b49e00112ff06c2f0a0e54499921dddcf8c3c6a8"
    -    },
    -    "ErnieForMaskedLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForMaskedLM"
    -        ],
    -        "sha": "30429830d1997222d885dcfdbd36d5e02d0d34b1"
    -    },
    -    "ErnieForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForMultipleChoice"
    -        ],
    -        "sha": "5a21144bf35dfb60560ff8249116ad4459c0069a"
    -    },
    -    "ErnieForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForNextSentencePrediction"
    -        ],
    -        "sha": "ed5868efb39bf6afb29f0cf444deafcf1e50b5bc"
    -    },
    -    "ErnieForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForPreTraining"
    -        ],
    -        "sha": "e4ad30d291c310fea25e6f91f91393f993513b42"
    -    },
    -    "ErnieForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForQuestionAnswering"
    -        ],
    -        "sha": "fe7c74b763f63a9fd864dad325385075df7c80c8"
    -    },
    -    "ErnieForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForSequenceClassification"
    -        ],
    -        "sha": "84e0be05fcd52f54e96a69f67a2481323a58a9db"
    -    },
    -    "ErnieForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieForTokenClassification"
    -        ],
    -        "sha": "91cf62c43a5a83332552ffa2d8e5e44d63a224ea"
    -    },
    -    "ErnieMForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "ErnieMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieMForMultipleChoice"
    -        ],
    -        "sha": "c42ee7fcb132a323ace314c32e63c8a7d36ce18f"
    -    },
    -    "ErnieMForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "ErnieMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieMForQuestionAnswering"
    -        ],
    -        "sha": "2b90dee75ca87b214f96db00002aa18244ec8e84"
    -    },
    -    "ErnieMForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "ErnieMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieMForSequenceClassification"
    -        ],
    -        "sha": "d8368646d8b1c67b1460af9c6ec13fd9d894cae6"
    -    },
    -    "ErnieMForTokenClassification": {
    -        "tokenizer_classes": [
    -            "ErnieMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieMForTokenClassification"
    -        ],
    -        "sha": "a9e29ba60fa0b7bedc2ed26a6b9911427df1ca6b"
    -    },
    -    "ErnieMModel": {
    -        "tokenizer_classes": [
    -            "ErnieMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieMModel"
    -        ],
    -        "sha": "7306eac3f38c3cf6211f0e741fdb81c6cc92bc09"
    -    },
    -    "ErnieModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ErnieModel"
    -        ],
    -        "sha": "b51478a9f40e353c41be3a29ccef103dcfe22b4b"
    -    },
    -    "EsmForMaskedLM": {
    -        "tokenizer_classes": [
    -            "EsmTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "EsmForMaskedLM",
    -            "TFEsmForMaskedLM"
    -        ],
    -        "sha": "b56297b6cd64b9ba7c613d0cd146f1ecbea8115e"
    -    },
    -    "EsmForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "EsmTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "EsmForSequenceClassification",
    -            "TFEsmForSequenceClassification"
    -        ],
    -        "sha": "cc6d7ef0a4763540d67b7a4fb31bede9a7d3f245"
    -    },
    -    "EsmForTokenClassification": {
    -        "tokenizer_classes": [
    -            "EsmTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "EsmForTokenClassification",
    -            "TFEsmForTokenClassification"
    -        ],
    -        "sha": "498953f66e260b974c504abbc863ee266d6c84a9"
    -    },
    -    "EsmModel": {
    -        "tokenizer_classes": [
    -            "EsmTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "EsmModel",
    -            "TFEsmModel"
    -        ],
    -        "sha": "183838263b70809310117a0761542501acf64c21"
    -    },
    -    "FNetForMaskedLM": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForMaskedLM"
    -        ],
    -        "sha": "91eaae1eac894af5d96c0221ec9bcef7f1af41c8"
    -    },
    -    "FNetForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForMultipleChoice"
    -        ],
    -        "sha": "c15d98d5f7a6f3ef3099b1257949bee208d5466e"
    -    },
    -    "FNetForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForNextSentencePrediction"
    -        ],
    -        "sha": "c59440b44d07d61fc45a90ded7fc11d6f25b143d"
    -    },
    -    "FNetForPreTraining": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForPreTraining"
    -        ],
    -        "sha": "c05f55ccfb2f2533babd3c6e99de7749bc8081da"
    -    },
    -    "FNetForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForQuestionAnswering"
    -        ],
    -        "sha": "47788e49dd435653fa2aa4b3ccae3572a870758e"
    -    },
    -    "FNetForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForSequenceClassification"
    -        ],
    -        "sha": "a3049b896ea6c5a32c364989c3afe604ee58b9fc"
    -    },
    -    "FNetForTokenClassification": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetForTokenClassification"
    -        ],
    -        "sha": "3bcdafca57d544bb81e2f7eead1e512c168582fc"
    -    },
    -    "FNetModel": {
    -        "tokenizer_classes": [
    -            "FNetTokenizer",
    -            "FNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FNetModel"
    -        ],
    -        "sha": "48fa66de37df126504db3b658806135eb877f505"
    -    },
    -    "FSMTForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "FSMTTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FSMTForConditionalGeneration"
    -        ],
    -        "sha": "6a1a981b29c8a98c1fd31bd0ad809f5575ca6c7a"
    -    },
    -    "FSMTModel": {
    -        "tokenizer_classes": [
    -            "FSMTTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FSMTModel"
    -        ],
    -        "sha": "683f6f73a2ab87801f1695a72d1af63cf173ab7c"
    -    },
    -    "FlaubertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertForMultipleChoice",
    -            "TFFlaubertForMultipleChoice"
    -        ],
    -        "sha": "8b12bd87a63f2e86c3482431742f6d8abf6ec4fd"
    -    },
    -    "FlaubertForQuestionAnsweringSimple": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertForQuestionAnsweringSimple",
    -            "TFFlaubertForQuestionAnsweringSimple"
    -        ],
    -        "sha": "5c0e7ad1efae7e3497f5cd6d2d9519403df49d37"
    -    },
    -    "FlaubertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertForSequenceClassification",
    -            "TFFlaubertForSequenceClassification"
    -        ],
    -        "sha": "762f12a8c99690be8ed2663b7af3011660174a7c"
    -    },
    -    "FlaubertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertForTokenClassification",
    -            "TFFlaubertForTokenClassification"
    -        ],
    -        "sha": "d2ab741c937bb69ef27c89e4c86a8c9d444874ca"
    -    },
    -    "FlaubertModel": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertModel",
    -            "TFFlaubertModel"
    -        ],
    -        "sha": "bdc2f8e17bb869393053429ec8c1c842bfeabb07"
    -    },
    -    "FlaubertWithLMHeadModel": {
    -        "tokenizer_classes": [
    -            "FlaubertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FlaubertWithLMHeadModel",
    -            "TFFlaubertWithLMHeadModel"
    -        ],
    -        "sha": "f20eb0932c90061003c9cc4e109c6ea22559c4f2"
    -    },
    -    "FlavaForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "FlavaImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FlavaForPreTraining"
    -        ],
    -        "sha": "6e9b2094060a5fa27984c7b49e5d0e820a88b487"
    -    },
    -    "FlavaModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "FlavaImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FlavaModel"
    -        ],
    -        "sha": "31ebf1b7a0ef1fd5059b98e28e5ab1c366d2c482"
    -    },
    -    "FocalNetBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FocalNetBackbone"
    -        ],
    -        "sha": "eb8c580969443cb87de7dd9a256deaface03692f"
    -    },
    -    "FocalNetForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FocalNetForImageClassification"
    -        ],
    -        "sha": "28d30ded26a3213e8fb7011a455afc3aa98b0a95"
    -    },
    -    "FocalNetForMaskedImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FocalNetForMaskedImageModeling"
    -        ],
    -        "sha": "0ea7626d19c9dd2f3113d977f643a1babc720bd3"
    -    },
    -    "FocalNetModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "BitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "FocalNetModel"
    -        ],
    -        "sha": "107b004e6aa14108a359b7d22bdb9aa141ec05d5"
    -    },
    -    "FunnelBaseModel": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelBaseModel",
    -            "TFFunnelBaseModel"
    -        ],
    -        "sha": "87fed4252812df23315a56531625333e315681c6"
    -    },
    -    "FunnelForMaskedLM": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForMaskedLM",
    -            "TFFunnelForMaskedLM"
    -        ],
    -        "sha": "5543daf29f185cd45f2599bd6f38c96064c9c8de"
    -    },
    -    "FunnelForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForMultipleChoice",
    -            "TFFunnelForMultipleChoice"
    -        ],
    -        "sha": "a8bf597e37dbefb1ac5c97c4cb162c3d522a33a1"
    -    },
    -    "FunnelForPreTraining": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForPreTraining",
    -            "TFFunnelForPreTraining"
    -        ],
    -        "sha": "cbcb300d60aacd5950a45409b6e3f0f240c9082e"
    -    },
    -    "FunnelForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForQuestionAnswering",
    -            "TFFunnelForQuestionAnswering"
    -        ],
    -        "sha": "6a5675305e096434e818486a13892cb55daffd13"
    -    },
    -    "FunnelForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForSequenceClassification",
    -            "TFFunnelForSequenceClassification"
    -        ],
    -        "sha": "1bc557a1e4314da21a44dee57b799e95a7025e5c"
    -    },
    -    "FunnelForTokenClassification": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelForTokenClassification",
    -            "TFFunnelForTokenClassification"
    -        ],
    -        "sha": "693bc1217a224efd558f410ddc8ffc63739bebc3"
    -    },
    -    "FunnelModel": {
    -        "tokenizer_classes": [
    -            "FunnelTokenizer",
    -            "FunnelTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "FunnelModel",
    -            "TFFunnelModel"
    -        ],
    -        "sha": "bfbaa8fa21c3abf80b94e7168b5ecff8ec5b5f76"
    -    },
    -    "GLPNForDepthEstimation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "GLPNImageProcessor"
    -        ],
    -        "model_classes": [
    -            "GLPNForDepthEstimation"
    -        ],
    -        "sha": "32ca1c1ef5d33242e5e7c0433bcd773c082f0260"
    -    },
    -    "GLPNModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "GLPNImageProcessor"
    -        ],
    -        "model_classes": [
    -            "GLPNModel"
    -        ],
    -        "sha": "24a8dbb48b1aa0ba2eba44324fcd0c78cca64dd4"
    -    },
    -    "GPT2ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPT2ForQuestionAnswering"
    -        ],
    -        "sha": "a5bdd6bd4d79feece85ea9a8bd4ee5fe54c1d45b"
    -    },
    -    "GPT2ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPT2ForSequenceClassification",
    -            "TFGPT2ForSequenceClassification"
    -        ],
    -        "sha": "90a2d78e5c7f288152f8456c3d58a43b40a58449"
    -    },
    -    "GPT2ForTokenClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPT2ForTokenClassification"
    -        ],
    -        "sha": "da78bc95b45fab2da9d43f2ca27164996e31ade1"
    -    },
    -    "GPT2LMHeadModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPT2LMHeadModel",
    -            "TFGPT2LMHeadModel"
    -        ],
    -        "sha": "78f56535d4ce19e9d7c0992e390085c5a4196b37"
    -    },
    -    "GPT2Model": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPT2Model",
    -            "TFGPT2Model"
    -        ],
    -        "sha": "d6694b0d8fe17978761c9305dc151780506b192e"
    -    },
    -    "GPTBigCodeForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTBigCodeForCausalLM"
    -        ],
    -        "sha": "99f7aaadf9c29669c63ef6c16f6bc5c07dbb9126"
    -    },
    -    "GPTBigCodeForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTBigCodeForSequenceClassification"
    -        ],
    -        "sha": "64a7398d5763161037b818314c60dd83d93d03e9"
    -    },
    -    "GPTBigCodeForTokenClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTBigCodeForTokenClassification"
    -        ],
    -        "sha": "310537ecd22d45f71bf594b17922cf2abc338eaf"
    -    },
    -    "GPTBigCodeModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTBigCodeModel"
    -        ],
    -        "sha": "3069419084a9dc36802d47de9df3d314ccfc2f28"
    -    },
    -    "GPTJForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTJForCausalLM",
    -            "TFGPTJForCausalLM"
    -        ],
    -        "sha": "1fff390baa45cb187903ebdd269c975bb9ed7386"
    -    },
    -    "GPTJForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTJForQuestionAnswering",
    -            "TFGPTJForQuestionAnswering"
    -        ],
    -        "sha": "3d4ec61dbed01f844d4c309971eeb5ad722c6c84"
    -    },
    -    "GPTJForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTJForSequenceClassification",
    -            "TFGPTJForSequenceClassification"
    -        ],
    -        "sha": "4b5db259cd16ca84ae2cd79aa4851cdd14479128"
    -    },
    -    "GPTJModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTJModel",
    -            "TFGPTJModel"
    -        ],
    -        "sha": "d8e1db30d08fbf57da6fc139aea3ffd63ab6226e"
    -    },
    -    "GPTNeoForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoForCausalLM"
    -        ],
    -        "sha": "e88934e402c15195dd99b2947632415dd7645268"
    -    },
    -    "GPTNeoForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoForQuestionAnswering"
    -        ],
    -        "sha": "623883e94bd08caf9b3f839b98debeea72d5bc2b"
    -    },
    -    "GPTNeoForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoForSequenceClassification"
    -        ],
    -        "sha": "bf2090d5d91a70eb37ba51fbdcf23afc7031fea8"
    -    },
    -    "GPTNeoForTokenClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoForTokenClassification"
    -        ],
    -        "sha": "d5208e73e24a1671219776b50fe5f96e0e4cd218"
    -    },
    -    "GPTNeoModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoModel"
    -        ],
    -        "sha": "72a7cd49da613c3125a90884df4763545c594e56"
    -    },
    -    "GPTNeoXForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXForCausalLM"
    -        ],
    -        "sha": "0229cfaaa843c6b492ac2abffabb00f1ff1936f8"
    -    },
    -    "GPTNeoXForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXForQuestionAnswering"
    -        ],
    -        "sha": "7d2f08c959c211129952ee03b5562add09fe6864"
    -    },
    -    "GPTNeoXForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXForSequenceClassification"
    -        ],
    -        "sha": "17c4b845ee2e0bb780ca2dea2d59a3d9d5d3c651"
    -    },
    -    "GPTNeoXForTokenClassification": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXForTokenClassification"
    -        ],
    -        "sha": "3aa4fe8a562f32230041d6d3616aa5ecc3f30192"
    -    },
    -    "GPTNeoXJapaneseForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPTNeoXJapaneseTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXJapaneseForCausalLM"
    -        ],
    -        "sha": "5fca2479f1064fd22e17f944c8fcc14f7e73f1d5"
    -    },
    -    "GPTNeoXJapaneseModel": {
    -        "tokenizer_classes": [
    -            "GPTNeoXJapaneseTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXJapaneseModel"
    -        ],
    -        "sha": "5c6ed124150df845cfc701d70b97fdcde687be52"
    -    },
    -    "GPTNeoXModel": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTNeoXModel"
    -        ],
    -        "sha": "33114ba2f72189d5a2bd63f0cdb78551189242ff"
    -    },
    -    "GPTSanJapaneseForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "GPTSanJapaneseTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "GPTSanJapaneseForConditionalGeneration"
    -        ],
    -        "sha": "ff6a41faaa713c7fbd5d9a1a50539745f9e1178e"
    -    },
    -    "GitForCausalLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "CLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "GitForCausalLM"
    -        ],
    -        "sha": "60f9c50466ae0beeb11776ca5bfeb6473f441554"
    -    },
    -    "GitModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "CLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "GitModel"
    -        ],
    -        "sha": "3d2eb6bddf95bb4a4e59b045d4e464c730c07f41"
    -    },
    -    "GroupViTModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "CLIPImageProcessor"
    -        ],
    -        "model_classes": [
    -            "GroupViTModel",
    -            "TFGroupViTModel"
    -        ],
    -        "sha": "05a3a02dd46cb9eb078608dec98f633c0cf559ef"
    -    },
    -    "HubertForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "HubertForCTC"
    -        ],
    -        "sha": "13431b76106f993eedcff48a75bae590a09b14f7"
    -    },
    -    "HubertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "HubertForSequenceClassification"
    -        ],
    -        "sha": "d23f46607a900b1a55dfee4b7ed205a6823035b1"
    -    },
    -    "HubertModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "HubertModel",
    -            "TFHubertModel"
    -        ],
    -        "sha": "3224562c86c4669db65ae7defdc5fb555b113e95"
    -    },
    -    "IBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertForMaskedLM"
    -        ],
    -        "sha": "e333a9c9d375f4d839b7e9e21d1a1c8dad58d7d1"
    -    },
    -    "IBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertForMultipleChoice"
    -        ],
    -        "sha": "a81f7d64cd7ce5fe6cd726b23d9d14ac5d17bf53"
    -    },
    -    "IBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertForQuestionAnswering"
    -        ],
    -        "sha": "7b66d13d4d6801a82cbeb7f9fd853ca1630d1f8b"
    -    },
    -    "IBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertForSequenceClassification"
    -        ],
    -        "sha": "309d57145c40f889222fe5df62f14dddf4496b38"
    -    },
    -    "IBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertForTokenClassification"
    -        ],
    -        "sha": "b032e9bff4b081b78c098b2d8bc610ac035c6ddf"
    -    },
    -    "IBertModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "IBertModel"
    -        ],
    -        "sha": "6749164c678d4883d455f98b1dfc98c62da8f08b"
    -    },
    -    "ImageGPTForCausalImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ImageGPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ImageGPTForCausalImageModeling"
    -        ],
    -        "sha": "9a7d1fc04439ab1d9d690de9c3e7673f08568cdf"
    -    },
    -    "ImageGPTForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ImageGPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ImageGPTForImageClassification"
    -        ],
    -        "sha": "d92c7aed4ba5de74a1f542b736010090e4a58b42"
    -    },
    -    "ImageGPTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ImageGPTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ImageGPTModel"
    -        ],
    -        "sha": "5a7983e48d5841704733dd0756177680ed50c074"
    -    },
    -    "LEDForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "LEDTokenizer",
    -            "LEDTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LEDForConditionalGeneration",
    -            "TFLEDForConditionalGeneration"
    -        ],
    -        "sha": "a354b49a79351f3ea8ae7776d9f8352ae26cfc14"
    -    },
    -    "LEDForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LEDTokenizer",
    -            "LEDTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LEDForQuestionAnswering"
    -        ],
    -        "sha": "47c7a75a1e650dae60ff6e9bbab0f2386946670c"
    -    },
    -    "LEDForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LEDTokenizer",
    -            "LEDTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LEDForSequenceClassification"
    -        ],
    -        "sha": "3571e2c9d9f2f2ec0b8fe47090330b128be05126"
    -    },
    -    "LEDModel": {
    -        "tokenizer_classes": [
    -            "LEDTokenizer",
    -            "LEDTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LEDModel",
    -            "TFLEDModel"
    -        ],
    -        "sha": "3c3f6eb142545afc570187bfdabfe65d43dafbe4"
    -    },
    -    "LayoutLMForMaskedLM": {
    -        "tokenizer_classes": [
    -            "LayoutLMTokenizer",
    -            "LayoutLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LayoutLMForMaskedLM",
    -            "TFLayoutLMForMaskedLM"
    -        ],
    -        "sha": "0368bd9bd8fd3eb43b8a3b38962b5345b8765514"
    -    },
    -    "LayoutLMForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LayoutLMTokenizer",
    -            "LayoutLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LayoutLMForQuestionAnswering",
    -            "TFLayoutLMForQuestionAnswering"
    -        ],
    -        "sha": "0d6a4bc614fccfa313c1fb6d132a250929518f85"
    -    },
    -    "LayoutLMForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMTokenizer",
    -            "LayoutLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LayoutLMForSequenceClassification",
    -            "TFLayoutLMForSequenceClassification"
    -        ],
    -        "sha": "1bd68c73dbf6c8c0526d24fbe2831be82998c440"
    -    },
    -    "LayoutLMForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMTokenizer",
    -            "LayoutLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LayoutLMForTokenClassification",
    -            "TFLayoutLMForTokenClassification"
    -        ],
    -        "sha": "155e7da3f1d786aa39d957b16080c52de4a7efd7"
    -    },
    -    "LayoutLMModel": {
    -        "tokenizer_classes": [
    -            "LayoutLMTokenizer",
    -            "LayoutLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LayoutLMModel",
    -            "TFLayoutLMModel"
    -        ],
    -        "sha": "14f77b30d267910f11f0fd532a91a6b85ab3a4de"
    -    },
    -    "LayoutLMv2ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LayoutLMv2Tokenizer",
    -            "LayoutLMv2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv2ForQuestionAnswering"
    -        ],
    -        "sha": "f452e28dd34d3c38cce046b1cc7b0ada69f587b1"
    -    },
    -    "LayoutLMv2ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv2Tokenizer",
    -            "LayoutLMv2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv2ForSequenceClassification"
    -        ],
    -        "sha": "b483e08fd143113629ecda3dbfd57e69bfeb5f11"
    -    },
    -    "LayoutLMv2ForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv2Tokenizer",
    -            "LayoutLMv2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv2ForTokenClassification"
    -        ],
    -        "sha": "0721ae69bff00ecfff1b3d1521a475cde0253299"
    -    },
    -    "LayoutLMv2Model": {
    -        "tokenizer_classes": [
    -            "LayoutLMv2Tokenizer",
    -            "LayoutLMv2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv2Model"
    -        ],
    -        "sha": "6a1b510769b344979a910a7d0bade613a9ec2dfc"
    -    },
    -    "LayoutLMv3ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv3ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv3ForQuestionAnswering",
    -            "TFLayoutLMv3ForQuestionAnswering"
    -        ],
    -        "sha": "4640242388e69cf77ea2dd3ac36ec6f1b26628c8"
    -    },
    -    "LayoutLMv3ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv3ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv3ForSequenceClassification",
    -            "TFLayoutLMv3ForSequenceClassification"
    -        ],
    -        "sha": "96515f699874cfbfbec7a64c539ae92419e4c6dc"
    -    },
    -    "LayoutLMv3ForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv3ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv3ForTokenClassification",
    -            "TFLayoutLMv3ForTokenClassification"
    -        ],
    -        "sha": "ed4ffc464f2028fe50dfc6823f4eda78d34be7e6"
    -    },
    -    "LayoutLMv3Model": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "LayoutLMv3ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LayoutLMv3Model",
    -            "TFLayoutLMv3Model"
    -        ],
    -        "sha": "69725e5e2445e5c1c3aa8a2aa49cfd72e0a44565"
    -    },
    -    "LevitForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "LevitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LevitForImageClassification"
    -        ],
    -        "sha": "5ae8ccaa1fe1c947cb8ae6499e4a150c668bb9f0"
    -    },
    -    "LevitForImageClassificationWithTeacher": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "LevitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LevitForImageClassificationWithTeacher"
    -        ],
    -        "sha": "568cc0d965b9bd293f240e7724314db6d50f6722"
    -    },
    -    "LevitModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "LevitImageProcessor"
    -        ],
    -        "model_classes": [
    -            "LevitModel"
    -        ],
    -        "sha": "172efa52b50c75c3b3e498fa638f55e65b2ebf87"
    -    },
    -    "LiltForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LiltForQuestionAnswering"
    -        ],
    -        "sha": "0a348441999e98ec003b29fc4d5a67ad22ee6ca2"
    -    },
    -    "LiltForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LiltForSequenceClassification"
    -        ],
    -        "sha": "c53ab0ba33536fe564a4a1e4f1674d990c01b83a"
    -    },
    -    "LiltForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LiltForTokenClassification"
    -        ],
    -        "sha": "14f85076f9b3f7016917e324d51ebd22511a2ae5"
    -    },
    -    "LiltModel": {
    -        "tokenizer_classes": [
    -            "LayoutLMv3Tokenizer",
    -            "LayoutLMv3TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LiltModel"
    -        ],
    -        "sha": "3f1166cc14c532388df7e82336a8e575a813bd3f"
    -    },
    -    "LongT5ForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongT5ForConditionalGeneration"
    -        ],
    -        "sha": "c685cbbe706ad5c9a28689631765726a1874dcc7"
    -    },
    -    "LongT5Model": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongT5Model"
    -        ],
    -        "sha": "6b468e55e2490565e6155690201086ac00c72062"
    -    },
    -    "LongformerForMaskedLM": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerForMaskedLM",
    -            "TFLongformerForMaskedLM"
    -        ],
    -        "sha": "929d3bda9a1485d9bae41f9dbfc1d149c1c4e78e"
    -    },
    -    "LongformerForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerForMultipleChoice",
    -            "TFLongformerForMultipleChoice"
    -        ],
    -        "sha": "60b1ecac6b9385ce18c7e6978ab161cce8e7f9d4"
    -    },
    -    "LongformerForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerForQuestionAnswering",
    -            "TFLongformerForQuestionAnswering"
    -        ],
    -        "sha": "be45ab1321b703f2200cbbcae560aaf2e2afef88"
    -    },
    -    "LongformerForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerForSequenceClassification",
    -            "TFLongformerForSequenceClassification"
    -        ],
    -        "sha": "8bc0de0b0f740bf397eb2770ec3ce3a24f3d7af9"
    -    },
    -    "LongformerForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerForTokenClassification",
    -            "TFLongformerForTokenClassification"
    -        ],
    -        "sha": "efa33a9b6f47f0f7979af08ae8d04a5a7363a14b"
    -    },
    -    "LongformerModel": {
    -        "tokenizer_classes": [
    -            "LongformerTokenizer",
    -            "LongformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LongformerModel",
    -            "TFLongformerModel"
    -        ],
    -        "sha": "b023d531688e8655fc09300ac36742588efb3240"
    -    },
    -    "LukeForMaskedLM": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeForMaskedLM"
    -        ],
    -        "sha": "954cf6cd2bf1f298a3956b10c36656c57387506d"
    -    },
    -    "LukeForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeForMultipleChoice"
    -        ],
    -        "sha": "d1310a9174ad50d60b30ad6049e165deb2539034"
    -    },
    -    "LukeForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeForQuestionAnswering"
    -        ],
    -        "sha": "3ea38da4e32cb4e45bea82b2e81a8639aeba2c35"
    -    },
    -    "LukeForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeForSequenceClassification"
    -        ],
    -        "sha": "b5b11248aeb4f5976379d15a977aeb2677e0c0f9"
    -    },
    -    "LukeForTokenClassification": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeForTokenClassification"
    -        ],
    -        "sha": "8aab1a33ad26a344a6f4dfd68630e9661e174471"
    -    },
    -    "LukeModel": {
    -        "tokenizer_classes": [
    -            "LukeTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LukeModel"
    -        ],
    -        "sha": "ae23a674e7297d41f33c9af86e039757dfd2d531"
    -    },
    -    "LxmertForPreTraining": {
    -        "tokenizer_classes": [
    -            "LxmertTokenizer",
    -            "LxmertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LxmertForPreTraining",
    -            "TFLxmertForPreTraining"
    -        ],
    -        "sha": "7b0843403c187aef00f20d5087086468d9613d2c"
    -    },
    -    "LxmertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "LxmertTokenizer",
    -            "LxmertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LxmertForQuestionAnswering"
    -        ],
    -        "sha": "27a74bd2cd156e46656c43ceb432c4deda0df5c1"
    -    },
    -    "LxmertModel": {
    -        "tokenizer_classes": [
    -            "LxmertTokenizer",
    -            "LxmertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "LxmertModel",
    -            "TFLxmertModel"
    -        ],
    -        "sha": "97612a0d6b14406ea9bfd7672e6974e0961cbef1"
    -    },
    -    "M2M100ForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "M2M100Tokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "M2M100ForConditionalGeneration"
    -        ],
    -        "sha": "32ac347092d51f658b41ffc111b67d49acdeab46"
    -    },
    -    "M2M100Model": {
    -        "tokenizer_classes": [
    -            "M2M100Tokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "M2M100Model"
    -        ],
    -        "sha": "e95c2ae168c7ba19f8114def40e1b1edd953b2f5"
    -    },
    -    "MBartForCausalLM": {
    -        "tokenizer_classes": [
    -            "MBartTokenizer",
    -            "MBartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MBartForCausalLM"
    -        ],
    -        "sha": "a45044f8056328d20a764356eca3d0746a7a195e"
    -    },
    -    "MBartForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "MBartTokenizer",
    -            "MBartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MBartForConditionalGeneration",
    -            "TFMBartForConditionalGeneration"
    -        ],
    -        "sha": "171e918962d6c0ee56c6b070858e19e16c8dd09f"
    -    },
    -    "MBartForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "MBartTokenizer",
    -            "MBartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MBartForQuestionAnswering"
    -        ],
    -        "sha": "1ee08565d24777335595e0d2940e454abdcff731"
    -    },
    -    "MBartForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "MBartTokenizer",
    -            "MBartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MBartForSequenceClassification"
    -        ],
    -        "sha": "53e9c88ecfa2475d27afe099ffa7a8bcdb7ef7e4"
    -    },
    -    "MBartModel": {
    -        "tokenizer_classes": [
    -            "MBartTokenizer",
    -            "MBartTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MBartModel",
    -            "TFMBartModel"
    -        ],
    -        "sha": "2d492b34d69dd63b411990d5c8bb692fd637e91c"
    -    },
    -    "MCTCTForCTC": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MCTCTFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MCTCTForCTC"
    -        ],
    -        "sha": "895a3d74f87b344b1f0a71eae4f085941d51b5cf"
    -    },
    -    "MCTCTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MCTCTFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MCTCTModel"
    -        ],
    -        "sha": "ce73d5c2b6fe163de778697d7b0543bf00d7ffa8"
    -    },
    -    "MPNetForMaskedLM": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetForMaskedLM",
    -            "TFMPNetForMaskedLM"
    -        ],
    -        "sha": "50af96e7d0202aef86e396c136e4c4fde8afe183"
    -    },
    -    "MPNetForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetForMultipleChoice",
    -            "TFMPNetForMultipleChoice"
    -        ],
    -        "sha": "af4ff8bf296a3a51f5ab6cd9f56741e4c732487c"
    -    },
    -    "MPNetForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetForQuestionAnswering",
    -            "TFMPNetForQuestionAnswering"
    -        ],
    -        "sha": "3e1a25c0d3243f78f81580c312ada3b39c06b428"
    -    },
    -    "MPNetForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetForSequenceClassification",
    -            "TFMPNetForSequenceClassification"
    -        ],
    -        "sha": "43da45c0a0d73c5a5567b4c7ec512ec5023e52dd"
    -    },
    -    "MPNetForTokenClassification": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetForTokenClassification",
    -            "TFMPNetForTokenClassification"
    -        ],
    -        "sha": "4e825eff24df533321ebab823eb66ce67e4ab3d9"
    -    },
    -    "MPNetModel": {
    -        "tokenizer_classes": [
    -            "MPNetTokenizer",
    -            "MPNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MPNetModel",
    -            "TFMPNetModel"
    -        ],
    -        "sha": "847c68344c2922e9a71fa8835b87a0f6f72b9f47"
    -    },
    -    "MarianForCausalLM": {
    -        "tokenizer_classes": [
    -            "MarianTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [],
    -        "sha": "5fb205e6db8e18e3c6cdd4e4709be292ba4599f3"
    -    },
    -    "MarianMTModel": {
    -        "tokenizer_classes": [
    -            "MarianTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MarianMTModel",
    -            "TFMarianMTModel"
    -        ],
    -        "sha": "0405f542b31561592231a86e3009d05256cbf49f"
    -    },
    -    "MarianModel": {
    -        "tokenizer_classes": [
    -            "MarianTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MarianModel",
    -            "TFMarianModel"
    -        ],
    -        "sha": "3649748c0286c6d5179a7013a716f7314db182a8"
    -    },
    -    "MarkupLMForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "MarkupLMTokenizer",
    -            "MarkupLMTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "MarkupLMFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MarkupLMForQuestionAnswering"
    -        ],
    -        "sha": "c8bb9f93591d980362547b0bdca9f23ace2f383e"
    -    },
    -    "MarkupLMForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "MarkupLMTokenizer",
    -            "MarkupLMTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "MarkupLMFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MarkupLMForSequenceClassification"
    -        ],
    -        "sha": "c2cb7245d68d76e0a5f993fc8a3de099ecebc68b"
    -    },
    -    "MarkupLMForTokenClassification": {
    -        "tokenizer_classes": [
    -            "MarkupLMTokenizer",
    -            "MarkupLMTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "MarkupLMFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MarkupLMForTokenClassification"
    -        ],
    -        "sha": "b9f924e82f400de0b34b46ee4ba276d686bd4890"
    -    },
    -    "MarkupLMModel": {
    -        "tokenizer_classes": [
    -            "MarkupLMTokenizer",
    -            "MarkupLMTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "MarkupLMFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "MarkupLMModel"
    -        ],
    -        "sha": "9687ba29f1c59d978e3d4b0fa702031f88eff53b"
    -    },
    -    "Mask2FormerForUniversalSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Mask2FormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Mask2FormerForUniversalSegmentation"
    -        ],
    -        "sha": "6429a7349527c9ef140ae691b83c47702cce1bc0"
    -    },
    -    "Mask2FormerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Mask2FormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Mask2FormerModel"
    -        ],
    -        "sha": "9bee8709204024b3669d503cdfe8890182f2a075"
    -    },
    -    "MaskFormerForInstanceSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MaskFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MaskFormerForInstanceSegmentation"
    -        ],
    -        "sha": "f844aaa81f55cb199c115f1bf95c217a70685570"
    -    },
    -    "MaskFormerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MaskFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MaskFormerModel"
    -        ],
    -        "sha": "473b54a464bc0ccee29bc23b4f6610f32eec05af"
    -    },
    -    "MegaForCausalLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForCausalLM"
    -        ],
    -        "sha": "6642b9da860f8b62abcfb0660feabcebf6698418"
    -    },
    -    "MegaForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForMaskedLM"
    -        ],
    -        "sha": "6b2d47ba03bec9e6f7eefdd4a67351fa191aae6f"
    -    },
    -    "MegaForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForMultipleChoice"
    -        ],
    -        "sha": "2b1e751da36a4410473eef07a62b09227a26d504"
    -    },
    -    "MegaForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForQuestionAnswering"
    -        ],
    -        "sha": "612acd9a53c351c42514adb3c04f2057d2870be7"
    -    },
    -    "MegaForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForSequenceClassification"
    -        ],
    -        "sha": "4871572da1613b7e9cfd3640c6d1129af004eefb"
    -    },
    -    "MegaForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaForTokenClassification"
    -        ],
    -        "sha": "450d3722c3b995215d06b9c12544c99f958581c7"
    -    },
    -    "MegaModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegaModel"
    -        ],
    -        "sha": "ca0862db27428893fe22f9bb5d2eb0875c2156f3"
    -    },
    -    "MegatronBertForCausalLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForCausalLM"
    -        ],
    -        "sha": "ff08d05ef8f98fdccf1f01560ec6ec4adbc8a3e3"
    -    },
    -    "MegatronBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForMaskedLM"
    -        ],
    -        "sha": "2ed25e2681d26b51b404ef1347a385c5f2c86a9a"
    -    },
    -    "MegatronBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForMultipleChoice"
    -        ],
    -        "sha": "1485af4b75f8f234d2b4b5aea50ab2ec55223a15"
    -    },
    -    "MegatronBertForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForNextSentencePrediction"
    -        ],
    -        "sha": "52bc9ee1d5145344f66b088ed278f07ed3d90584"
    -    },
    -    "MegatronBertForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForPreTraining"
    -        ],
    -        "sha": "e580d0efd54e1c92789e39b32929234e36ee427f"
    -    },
    -    "MegatronBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForQuestionAnswering"
    -        ],
    -        "sha": "7342ba042a3c30c15382d00fcb0521533fc43841"
    -    },
    -    "MegatronBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForSequenceClassification"
    -        ],
    -        "sha": "6a7cd480511d817a1e221c8f7558c55a93baed1b"
    -    },
    -    "MegatronBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertForTokenClassification"
    -        ],
    -        "sha": "8b5334b6ec5f025293ca861de474b57ca84bc005"
    -    },
    -    "MegatronBertModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MegatronBertModel"
    -        ],
    -        "sha": "f2457fbe535ba97ea13db049f53618b42e13f047"
    -    },
    -    "MgpstrForSceneTextRecognition": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MgpstrProcessor"
    -        ],
    -        "model_classes": [
    -            "MgpstrForSceneTextRecognition"
    -        ],
    -        "sha": "f197d5bfa1fe27b5f28a6e6d4e3ad229b753450a"
    -    },
    -    "MobileBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForMaskedLM",
    -            "TFMobileBertForMaskedLM"
    -        ],
    -        "sha": "d689e737d73ad23aed3aabd3177591fc827d1c62"
    -    },
    -    "MobileBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForMultipleChoice",
    -            "TFMobileBertForMultipleChoice"
    -        ],
    -        "sha": "403d1f88be7eb0c769ff3a8e57eab21cc3e75afb"
    -    },
    -    "MobileBertForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForNextSentencePrediction",
    -            "TFMobileBertForNextSentencePrediction"
    -        ],
    -        "sha": "b4d8836a0f259ee3bca9f230093836c9117c5e4d"
    -    },
    -    "MobileBertForPreTraining": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForPreTraining",
    -            "TFMobileBertForPreTraining"
    -        ],
    -        "sha": "fbaa13ea6f9fcebb9fde620dd009d12510440d17"
    -    },
    -    "MobileBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForQuestionAnswering",
    -            "TFMobileBertForQuestionAnswering"
    -        ],
    -        "sha": "ba6a55cf2daec55bfb220c9bab0bc4ad96510087"
    -    },
    -    "MobileBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForSequenceClassification",
    -            "TFMobileBertForSequenceClassification"
    -        ],
    -        "sha": "17ab35603bec351457e035eef2d0426538071f72"
    -    },
    -    "MobileBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertForTokenClassification",
    -            "TFMobileBertForTokenClassification"
    -        ],
    -        "sha": "dee83e820e6c4f069886a5d1875bf6775897313e"
    -    },
    -    "MobileBertModel": {
    -        "tokenizer_classes": [
    -            "MobileBertTokenizer",
    -            "MobileBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MobileBertModel",
    -            "TFMobileBertModel"
    -        ],
    -        "sha": "09b2db33ea798a762eeaf7e727e95f9ea8a6d14f"
    -    },
    -    "MobileNetV1ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileNetV1ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileNetV1ForImageClassification"
    -        ],
    -        "sha": "55023dbd0935f147bf1bccf960cea01ca07e0f0c"
    -    },
    -    "MobileNetV1Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileNetV1ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileNetV1Model"
    -        ],
    -        "sha": "178bd24528147a028938d6ee5c7e65c969ea37b0"
    -    },
    -    "MobileNetV2ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileNetV2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileNetV2ForImageClassification"
    -        ],
    -        "sha": "ff907f740cf9ea91bc3cdf403a94ae28fbb2548a"
    -    },
    -    "MobileNetV2ForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileNetV2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileNetV2ForSemanticSegmentation"
    -        ],
    -        "sha": "48adbc340e42882f52b54d4f5dd045e16e9ef2d6"
    -    },
    -    "MobileNetV2Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileNetV2ImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileNetV2Model"
    -        ],
    -        "sha": "e876885828825472a80ef1796d89d60b901813ba"
    -    },
    -    "MobileViTForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTForImageClassification",
    -            "TFMobileViTForImageClassification"
    -        ],
    -        "sha": "7d0b31864f856e00f9e34e8c6781dcc7a8cdaf1e"
    -    },
    -    "MobileViTForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTForSemanticSegmentation",
    -            "TFMobileViTForSemanticSegmentation"
    -        ],
    -        "sha": "215f727caa3c3fc94fa4df486aa706e5d99d4194"
    -    },
    -    "MobileViTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTModel",
    -            "TFMobileViTModel"
    -        ],
    -        "sha": "b3a1452e7cb44b600b21ee14f3d5382366855a46"
    -    },
    -    "MobileViTV2ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTV2ForImageClassification"
    -        ],
    -        "sha": "25752b0967ad594341d1b685401450d7f698433c"
    -    },
    -    "MobileViTV2ForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTV2ForSemanticSegmentation"
    -        ],
    -        "sha": "13b953f50be33219d55a12f1098be38b88000897"
    -    },
    -    "MobileViTV2Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "MobileViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "MobileViTV2Model"
    -        ],
    -        "sha": "2f46357659db2d6d54d870e28073deeea1c8cb64"
    -    },
    -    "MptForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MptForCausalLM"
    -        ],
    -        "sha": "500c869b956c65f6b1a7b4867727f124c6f5728a"
    -    },
    -    "MptForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MptForQuestionAnswering"
    -        ],
    -        "sha": "6ee46572bf61eb5e7dbbdaf00b73c4d37efc42d9"
    -    },
    -    "MptForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MptForSequenceClassification"
    -        ],
    -        "sha": "f0b9153413b5dfceeb96b67d4b0f22c94bbaf64a"
    -    },
    -    "MptForTokenClassification": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MptForTokenClassification"
    -        ],
    -        "sha": "3f7c3ccd67cd0b2aae56d37613429a64ef813246"
    -    },
    -    "MptModel": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MptModel"
    -        ],
    -        "sha": "ea747f234556661b0c8b84a626f267066ce586bf"
    -    },
    -    "MraForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraForMaskedLM"
    -        ],
    -        "sha": "c00ee46cfd2b8fed29cc37f0a4ead40ad51a439c"
    -    },
    -    "MraForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraForMultipleChoice"
    -        ],
    -        "sha": "f397469ba8109f64dab2d75335ea7bf0c2dbeb74"
    -    },
    -    "MraForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraForQuestionAnswering"
    -        ],
    -        "sha": "c2ed75acd20e5440a76d6504d9a3ebc2513011f0"
    -    },
    -    "MraForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraForSequenceClassification"
    -        ],
    -        "sha": "f47672d3708508bda7774215bee44a92ec16ab2f"
    -    },
    -    "MraForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraForTokenClassification"
    -        ],
    -        "sha": "f0961ab5818bca473607fb94b391c186dc1d3492"
    -    },
    -    "MraModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MraModel"
    -        ],
    -        "sha": "315f34f30bcc4b0b66b11987726df2a80c50e271"
    -    },
    -    "MvpForCausalLM": {
    -        "tokenizer_classes": [
    -            "MvpTokenizer",
    -            "MvpTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MvpForCausalLM"
    -        ],
    -        "sha": "105e5f2c8a0f20d404cb71795539cda5dd49716d"
    -    },
    -    "MvpForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "MvpTokenizer",
    -            "MvpTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MvpForConditionalGeneration"
    -        ],
    -        "sha": "b0b706f14b2f8aae288cba30ae0064e0be7e888b"
    -    },
    -    "MvpForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "MvpTokenizer",
    -            "MvpTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MvpForQuestionAnswering"
    -        ],
    -        "sha": "82f152b36a40a4c22edcb146e6eaec636d84fa2d"
    -    },
    -    "MvpForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "MvpTokenizer",
    -            "MvpTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MvpForSequenceClassification"
    -        ],
    -        "sha": "506b68544d064001929ee9e6db3752e62972a6aa"
    -    },
    -    "MvpModel": {
    -        "tokenizer_classes": [
    -            "MvpTokenizer",
    -            "MvpTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "MvpModel"
    -        ],
    -        "sha": "3f4653184721a2bc029b27706d335ef7ddd219d5"
    -    },
    -    "NatBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "NatBackbone"
    -        ],
    -        "sha": "d5cc5eccba4da609c82e9f5c649301b9f9fee9fb"
    -    },
    -    "NatForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "NatForImageClassification"
    -        ],
    -        "sha": "2ff4c9e73c49c392c02a467e87b5511fd924242a"
    -    },
    -    "NatModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "NatModel"
    -        ],
    -        "sha": "75e9756bb94d0ccdce98a8e963eeecbc66f9d573"
    -    },
    -    "NezhaForMaskedLM": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForMaskedLM"
    -        ],
    -        "sha": "5991cca4b78f0ed7299259a71f3eeed3f3452b72"
    -    },
    -    "NezhaForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForMultipleChoice"
    -        ],
    -        "sha": "0f6e9ec791d85ad4503acdec50b3a120f984016b"
    -    },
    -    "NezhaForNextSentencePrediction": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForNextSentencePrediction"
    -        ],
    -        "sha": "9a34316c14ec8ecc98ff08e46760915c80098a57"
    -    },
    -    "NezhaForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForPreTraining"
    -        ],
    -        "sha": "6259db427a0073061de352ea819d38a74798edd7"
    -    },
    -    "NezhaForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForQuestionAnswering"
    -        ],
    -        "sha": "31c6a34e85ae8c41294e0f4ef25044e00e511c4d"
    -    },
    -    "NezhaForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForSequenceClassification"
    -        ],
    -        "sha": "db057c308ba2e05f223404de11e1816ce4bd62a9"
    -    },
    -    "NezhaForTokenClassification": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaForTokenClassification"
    -        ],
    -        "sha": "235f4e10b4a59709650c2bece3e342ec153d9cfc"
    -    },
    -    "NezhaModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NezhaModel"
    -        ],
    -        "sha": "80e05ba7c55bcdd7f4d1387ef9a09a7a8e95b5ac"
    -    },
    -    "NllbMoeForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "NllbTokenizer",
    -            "NllbTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NllbMoeForConditionalGeneration"
    -        ],
    -        "sha": "2a7f87dffe826af3d52086888f3f3773246e5528"
    -    },
    -    "NllbMoeModel": {
    -        "tokenizer_classes": [
    -            "NllbTokenizer",
    -            "NllbTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NllbMoeModel"
    -        ],
    -        "sha": "9f7a2261eed4658e1aa5623be4672ba64bee7da5"
    -    },
    -    "NystromformerForMaskedLM": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerForMaskedLM"
    -        ],
    -        "sha": "37036847783f1e65e81ecd43803270a1ecb276f3"
    -    },
    -    "NystromformerForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerForMultipleChoice"
    -        ],
    -        "sha": "42a077d5ab6830e20560466eaccc525eff10c3ae"
    -    },
    -    "NystromformerForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerForQuestionAnswering"
    -        ],
    -        "sha": "1cfaf79051731824db4f09989f093f87f4fceec5"
    -    },
    -    "NystromformerForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerForSequenceClassification"
    -        ],
    -        "sha": "d75231203066df41e9b6b25dbee9ad40e8515c18"
    -    },
    -    "NystromformerForTokenClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerForTokenClassification"
    -        ],
    -        "sha": "5a499dc96e106bf41fc9166f2ad06527ec7ca14e"
    -    },
    -    "NystromformerModel": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizer",
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "NystromformerModel"
    -        ],
    -        "sha": "2b6adb37ec473b15d71e2eb459acea08df6940ce"
    -    },
    -    "OPTForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OPTForCausalLM",
    -            "TFOPTForCausalLM"
    -        ],
    -        "sha": "190d1f4fc0011d2eaeaa05282e0fbd2445e4b11f"
    -    },
    -    "OPTForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OPTForQuestionAnswering"
    -        ],
    -        "sha": "0fa9277ce10dbc3d0922b354befb684a136af00b"
    -    },
    -    "OPTForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OPTForSequenceClassification"
    -        ],
    -        "sha": "784ab288ab7280b1853ee400ef10ee2a965df352"
    -    },
    -    "OPTModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OPTModel",
    -            "TFOPTModel"
    -        ],
    -        "sha": "901d92b8f51edb0ec9614cb185fb66a8b5d364c3"
    -    },
    -    "OneFormerForUniversalSegmentation": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "OneFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "OneFormerForUniversalSegmentation"
    -        ],
    -        "sha": "fee1cfd676acc40f09017702ddac6504f3090d14"
    -    },
    -    "OneFormerModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "OneFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "OneFormerModel"
    -        ],
    -        "sha": "4163a79328c78f93ec57942598698a138c19a577"
    -    },
    -    "OpenAIGPTForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "OpenAIGPTTokenizer",
    -            "OpenAIGPTTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OpenAIGPTForSequenceClassification",
    -            "TFOpenAIGPTForSequenceClassification"
    -        ],
    -        "sha": "c513f7f952935085f7573bf70a1ac3ad8f33434c"
    -    },
    -    "OpenAIGPTLMHeadModel": {
    -        "tokenizer_classes": [
    -            "OpenAIGPTTokenizer",
    -            "OpenAIGPTTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OpenAIGPTLMHeadModel",
    -            "TFOpenAIGPTLMHeadModel"
    -        ],
    -        "sha": "33f59ecd860f7a998483ec7631fe32d257235461"
    -    },
    -    "OpenAIGPTModel": {
    -        "tokenizer_classes": [
    -            "OpenAIGPTTokenizer",
    -            "OpenAIGPTTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "OpenAIGPTModel",
    -            "TFOpenAIGPTModel"
    -        ],
    -        "sha": "00f6ec0a3a5276af71d08a26199e0ccbf2556fc9"
    -    },
    -    "OwlViTForObjectDetection": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "OwlViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "OwlViTForObjectDetection"
    -        ],
    -        "sha": "af958c9164f23d0f12921a8edf687f9aaa6af90e"
    -    },
    -    "OwlViTModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "OwlViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "OwlViTModel"
    -        ],
    -        "sha": "f0e27b2b4e53ba70e05d13dcfea8e85272b292a5"
    -    },
    -    "PLBartForCausalLM": {
    -        "tokenizer_classes": [
    -            "PLBartTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PLBartForCausalLM"
    -        ],
    -        "sha": "6ee51133246dbdb18fc3681ebd62d21e421b9bb4"
    -    },
    -    "PLBartForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "PLBartTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PLBartForConditionalGeneration"
    -        ],
    -        "sha": "ba191d28f4678d20b4dfed5fca5944018282cf20"
    -    },
    -    "PLBartForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "PLBartTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PLBartForSequenceClassification"
    -        ],
    -        "sha": "02063b3d9707fcff619a4e37a0d6e58f76e39b18"
    -    },
    -    "PLBartModel": {
    -        "tokenizer_classes": [
    -            "PLBartTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PLBartModel"
    -        ],
    -        "sha": "cfbba29169b3f40d800403fc1b53982e1f88c5f8"
    -    },
    -    "PegasusForCausalLM": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PegasusForCausalLM"
    -        ],
    -        "sha": "6e685a698302a3ba33e5379d3a37eb0bc1ae2f70"
    -    },
    -    "PegasusForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PegasusForConditionalGeneration",
    -            "TFPegasusForConditionalGeneration"
    -        ],
    -        "sha": "15e58ee2ebc14b6e80ef2891259057ee5f049be2"
    -    },
    -    "PegasusModel": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PegasusModel",
    -            "TFPegasusModel"
    -        ],
    -        "sha": "fa36b24523db411ef77903453346b8be81ef73fe"
    -    },
    -    "PegasusXForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PegasusXForConditionalGeneration"
    -        ],
    -        "sha": "7588a8120f26a36c1687c14bdf1e9f9656891c1a"
    -    },
    -    "PegasusXModel": {
    -        "tokenizer_classes": [
    -            "PegasusTokenizer",
    -            "PegasusTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "PegasusXModel"
    -        ],
    -        "sha": "a0bdff627416ac3c39c22d081f5d88d8b8fd99cc"
    -    },
    -    "PerceiverForImageClassificationConvProcessing": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverForImageClassificationConvProcessing"
    -        ],
    -        "sha": "2c1e5e62ebc9d0c931adc8c665fb05bde6c1c1f1"
    -    },
    -    "PerceiverForImageClassificationFourier": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverForImageClassificationFourier"
    -        ],
    -        "sha": "88da41b8851b76b8be0dacdb3de023db02bb031a"
    -    },
    -    "PerceiverForImageClassificationLearned": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverForImageClassificationLearned"
    -        ],
    -        "sha": "879bd1fa38d3baddb027bb2cacba2d160a741375"
    -    },
    -    "PerceiverForMaskedLM": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverForMaskedLM"
    -        ],
    -        "sha": "1d2459cbd281ef72da5682e65102aaca96183045"
    -    },
    -    "PerceiverForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverForSequenceClassification"
    -        ],
    -        "sha": "576f1f96348f0343458499fbf53d4102b5c0f2ff"
    -    },
    -    "PerceiverModel": {
    -        "tokenizer_classes": [
    -            "PerceiverTokenizer"
    -        ],
    -        "processor_classes": [
    -            "PerceiverImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PerceiverModel"
    -        ],
    -        "sha": "83ec4d2d61ed62525ee033e13d144817beb29d19"
    -    },
    -    "Pix2StructForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "Pix2StructImageProcessor",
    -            "Pix2StructProcessor"
    -        ],
    -        "model_classes": [],
    -        "sha": "42b3de00ad535076c4893e4ac5ae2d2748cc4ccb"
    -    },
    -    "PoolFormerForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "PoolFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PoolFormerForImageClassification"
    -        ],
    -        "sha": "ef04de5a6896100d457fb9553dd9789c09cca98e"
    -    },
    -    "PoolFormerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "PoolFormerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PoolFormerModel"
    -        ],
    -        "sha": "e8037215ebdbf795329ef6525cdc6aa547f04ace"
    -    },
    -    "ProphetNetForCausalLM": {
    -        "tokenizer_classes": [
    -            "ProphetNetTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ProphetNetForCausalLM"
    -        ],
    -        "sha": "d40b1e75bbc5ea0839563457aff6eee5bc0bb03e"
    -    },
    -    "ProphetNetForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "ProphetNetTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ProphetNetForConditionalGeneration"
    -        ],
    -        "sha": "d842875c41278032af39c03c66902786bb5ff2c7"
    -    },
    -    "ProphetNetModel": {
    -        "tokenizer_classes": [
    -            "ProphetNetTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ProphetNetModel"
    -        ],
    -        "sha": "f1ddbbcc768c7ba54c4d75b319540c1635e65937"
    -    },
    -    "PvtForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "PvtImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PvtForImageClassification"
    -        ],
    -        "sha": "589b37bd6941aff6dd248259f9eee3c422a41fde"
    -    },
    -    "PvtModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "PvtImageProcessor"
    -        ],
    -        "model_classes": [
    -            "PvtModel"
    -        ],
    -        "sha": "c40765c382515ae627652d60e9077b6478448d48"
    -    },
    -    "ReformerForMaskedLM": {
    -        "tokenizer_classes": [
    -            "ReformerTokenizer",
    -            "ReformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ReformerForMaskedLM"
    -        ],
    -        "sha": "1e6431e42c676b525e3215e9e3cc8f1404f9f82b"
    -    },
    -    "ReformerForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "ReformerTokenizer",
    -            "ReformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ReformerForQuestionAnswering"
    -        ],
    -        "sha": "62b43977f244474bd6982c6327d0c57310258fcd"
    -    },
    -    "ReformerForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "ReformerTokenizer",
    -            "ReformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ReformerForSequenceClassification"
    -        ],
    -        "sha": "67bd534a990a7dcfa02406987e7f066caa2a30e8"
    -    },
    -    "ReformerModel": {
    -        "tokenizer_classes": [
    -            "ReformerTokenizer",
    -            "ReformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "ReformerModel"
    -        ],
    -        "sha": "a34ddb1389067448e9bc1323de674951cfb4cff1"
    -    },
    -    "ReformerModelWithLMHead": {
    -        "tokenizer_classes": [
    -            "ReformerTokenizer",
    -            "ReformerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [],
    -        "sha": "e7a8addaea8407d4c55e144e48aee04be6cca618"
    -    },
    -    "RegNetForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "RegNetForImageClassification",
    -            "TFRegNetForImageClassification"
    -        ],
    -        "sha": "5ec67c84fc7944c0c5b386bd26820bc4d1f3b32a"
    -    },
    -    "RegNetModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "RegNetModel",
    -            "TFRegNetModel"
    -        ],
    -        "sha": "72375e1401dc8271d4abb6295c9cee376f7b8f1a"
    -    },
    -    "RemBertForCausalLM": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForCausalLM",
    -            "TFRemBertForCausalLM"
    -        ],
    -        "sha": "8d9ae3d74a0e0a8958b4ee8c9dca3632abf52ef9"
    -    },
    -    "RemBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForMaskedLM",
    -            "TFRemBertForMaskedLM"
    -        ],
    -        "sha": "b7c27d01e1cc3bef9ddd6a78627d700b3bffd759"
    -    },
    -    "RemBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForMultipleChoice",
    -            "TFRemBertForMultipleChoice"
    -        ],
    -        "sha": "2fe192677b9740cf24dd559339d46925e8ac23d4"
    -    },
    -    "RemBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForQuestionAnswering",
    -            "TFRemBertForQuestionAnswering"
    -        ],
    -        "sha": "22b8ba44681b96292a1cf7f6df4ba6bb7937ec6e"
    -    },
    -    "RemBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForSequenceClassification",
    -            "TFRemBertForSequenceClassification"
    -        ],
    -        "sha": "20f3e89341ea15266d2685a8798142fba03c3f98"
    -    },
    -    "RemBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertForTokenClassification",
    -            "TFRemBertForTokenClassification"
    -        ],
    -        "sha": "15712ff753708da3cf0550e76e73a5d0bba7784e"
    -    },
    -    "RemBertModel": {
    -        "tokenizer_classes": [
    -            "RemBertTokenizer",
    -            "RemBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RemBertModel",
    -            "TFRemBertModel"
    -        ],
    -        "sha": "59cc6d099b1ded0aaead8684457415b129f79e86"
    -    },
    -    "ResNetBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ResNetBackbone"
    -        ],
    -        "sha": "c84a6bcf8af4b6a3403dea3cf4c55965ac39f239"
    -    },
    -    "ResNetForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ResNetForImageClassification",
    -            "TFResNetForImageClassification"
    -        ],
    -        "sha": "34a180ad24d80811d420d7aa4fbec4a17751aaf8"
    -    },
    -    "ResNetModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ResNetModel",
    -            "TFResNetModel"
    -        ],
    -        "sha": "fafa6cdf9986c6cfbae360596b3574162430bcd3"
    -    },
    -    "RoCBertForCausalLM": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForCausalLM"
    -        ],
    -        "sha": "194d8dafc4f4142f8d31e6b4be14b55d812f923b"
    -    },
    -    "RoCBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForMaskedLM"
    -        ],
    -        "sha": "8bc285f32f3b932dbd56ddf91b1170734d638eeb"
    -    },
    -    "RoCBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForMultipleChoice"
    -        ],
    -        "sha": "bb54e5ae021d728022d34b12fee3f087d9486af9"
    -    },
    -    "RoCBertForPreTraining": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForPreTraining"
    -        ],
    -        "sha": "86ebbd5b0bc84660ad7f505082eff19b86c137c8"
    -    },
    -    "RoCBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForQuestionAnswering"
    -        ],
    -        "sha": "1bfc2dc3d6e76170e6dca1ff32a54a0887ff28a3"
    -    },
    -    "RoCBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForSequenceClassification"
    -        ],
    -        "sha": "c329038802241f454273894128fea38b60f7c739"
    -    },
    -    "RoCBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertForTokenClassification"
    -        ],
    -        "sha": "afe5ec22c2ad1d9ff6e3e64c87eb7555faaa936d"
    -    },
    -    "RoCBertModel": {
    -        "tokenizer_classes": [
    -            "RoCBertTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoCBertModel"
    -        ],
    -        "sha": "29de5580d5f5d3461a88673e7b4c492a9d8a67a4"
    -    },
    -    "RoFormerForCausalLM": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForCausalLM",
    -            "TFRoFormerForCausalLM"
    -        ],
    -        "sha": "6e074219c6dd8f8b221bbfda64fba100f729f88d"
    -    },
    -    "RoFormerForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForMaskedLM",
    -            "TFRoFormerForMaskedLM"
    -        ],
    -        "sha": "a3a4d05f9b29601553a77244f2adcf8194f9367c"
    -    },
    -    "RoFormerForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForMultipleChoice",
    -            "TFRoFormerForMultipleChoice"
    -        ],
    -        "sha": "aca3999a1d14f09644faed44e2cdfb28ed68a3d3"
    -    },
    -    "RoFormerForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForQuestionAnswering",
    -            "TFRoFormerForQuestionAnswering"
    -        ],
    -        "sha": "b8a20b3a788f178b9ef64e2eb9587f693dca1b69"
    -    },
    -    "RoFormerForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForSequenceClassification",
    -            "TFRoFormerForSequenceClassification"
    -        ],
    -        "sha": "d092e2d5e62012bf4ec921e763b37865d6189216"
    -    },
    -    "RoFormerForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerForTokenClassification",
    -            "TFRoFormerForTokenClassification"
    -        ],
    -        "sha": "85d3a17062e1f3e0539abfe738a88203e25349b6"
    -    },
    -    "RoFormerModel": {
    -        "tokenizer_classes": [
    -            "RoFormerTokenizer",
    -            "RoFormerTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RoFormerModel",
    -            "TFRoFormerModel"
    -        ],
    -        "sha": "22e7df2f4cd66caf449f2342f63d176005afccc9"
    -    },
    -    "RobertaForCausalLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForCausalLM",
    -            "TFRobertaForCausalLM"
    -        ],
    -        "sha": "5d1d24d56f9735402e50a2ea513ffde44487733e"
    -    },
    -    "RobertaForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForMaskedLM",
    -            "TFRobertaForMaskedLM"
    -        ],
    -        "sha": "b21c9daf0b3b66530bf5d45d67df5ec392b5059c"
    -    },
    -    "RobertaForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForMultipleChoice",
    -            "TFRobertaForMultipleChoice"
    -        ],
    -        "sha": "10020d9546d4d7318f4d514fe13daaad07e6269f"
    -    },
    -    "RobertaForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForQuestionAnswering",
    -            "TFRobertaForQuestionAnswering"
    -        ],
    -        "sha": "eea4a81306891746bac9e7715f805a2d9dbf4be7"
    -    },
    -    "RobertaForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForSequenceClassification",
    -            "TFRobertaForSequenceClassification"
    -        ],
    -        "sha": "6a6f53fc6ab98e29ed539e76b1cb76d25a2cd720"
    -    },
    -    "RobertaForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaForTokenClassification",
    -            "TFRobertaForTokenClassification"
    -        ],
    -        "sha": "9190044c4091eb0d98ae7638c453e24846bca5d7"
    -    },
    -    "RobertaModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaModel",
    -            "TFRobertaModel"
    -        ],
    -        "sha": "181a0b8a7ad24500ec327ad07ddb225f0680ac0a"
    -    },
    -    "RobertaPreLayerNormForCausalLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForCausalLM",
    -            "TFRobertaPreLayerNormForCausalLM"
    -        ],
    -        "sha": "73b6d4531b41f295a5d310d7aa44736004a59865"
    -    },
    -    "RobertaPreLayerNormForMaskedLM": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForMaskedLM",
    -            "TFRobertaPreLayerNormForMaskedLM"
    -        ],
    -        "sha": "a61723c77e5ab7adc95285e7823a0a49b99af395"
    -    },
    -    "RobertaPreLayerNormForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForMultipleChoice",
    -            "TFRobertaPreLayerNormForMultipleChoice"
    -        ],
    -        "sha": "3dcfa62e0771358c60232a18135bfe7c7f6d715e"
    -    },
    -    "RobertaPreLayerNormForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForQuestionAnswering",
    -            "TFRobertaPreLayerNormForQuestionAnswering"
    -        ],
    -        "sha": "a8e76a5a50f7df60055e5ed6a1c3af2e7d34cf01"
    -    },
    -    "RobertaPreLayerNormForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForSequenceClassification",
    -            "TFRobertaPreLayerNormForSequenceClassification"
    -        ],
    -        "sha": "7509cb0286d146ef2fc6beb8867ae31b92fb1b16"
    -    },
    -    "RobertaPreLayerNormForTokenClassification": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormForTokenClassification",
    -            "TFRobertaPreLayerNormForTokenClassification"
    -        ],
    -        "sha": "3ad5814ba126b41e18c1978c970e396fab6da9bf"
    -    },
    -    "RobertaPreLayerNormModel": {
    -        "tokenizer_classes": [
    -            "RobertaTokenizer",
    -            "RobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RobertaPreLayerNormModel",
    -            "TFRobertaPreLayerNormModel"
    -        ],
    -        "sha": "4830db38fd310404c5ab70bd00684eca0bc06ca8"
    -    },
    -    "RwkvForCausalLM": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RwkvForCausalLM"
    -        ],
    -        "sha": "2f452fd46b39e39b1a6a95fa1d8232405bbb3e96"
    -    },
    -    "RwkvModel": {
    -        "tokenizer_classes": [
    -            "GPTNeoXTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "RwkvModel"
    -        ],
    -        "sha": "88a52c9437dc3c06f65a8252490be7eb91197804"
    -    },
    -    "SEWDForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWDForCTC"
    -        ],
    -        "sha": "5c7495c77ae9e0f12c0de05d3a5fb95bdcd91768"
    -    },
    -    "SEWDForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWDForSequenceClassification"
    -        ],
    -        "sha": "d6cbf1164ce1999fdaf3deeb7a6eba19a3b1f873"
    -    },
    -    "SEWDModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWDModel"
    -        ],
    -        "sha": "dde4e02219449f149bb3403bbeae127cafaf9c79"
    -    },
    -    "SEWForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWForCTC"
    -        ],
    -        "sha": "4477c7a277059fba08772acf91cf3e3dd3cb073b"
    -    },
    -    "SEWForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWForSequenceClassification"
    -        ],
    -        "sha": "3b90fbb1c0c3848fed18f91a0169bb297a3e6619"
    -    },
    -    "SEWModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SEWModel"
    -        ],
    -        "sha": "0a0fbb844eeefa0dce62bd05db30a2bb91e5dc88"
    -    },
    -    "SamModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "SamImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SamModel",
    -            "TFSamModel"
    -        ],
    -        "sha": "eca8651bc84e5ac3b1b62e784b744a6bd1b82575"
    -    },
    -    "SegformerForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "SegformerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SegformerForImageClassification",
    -            "TFSegformerForImageClassification"
    -        ],
    -        "sha": "c566ae0ed382be4ed61ed6dacffa2ba663e9cc19"
    -    },
    -    "SegformerForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "SegformerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SegformerForSemanticSegmentation",
    -            "TFSegformerForSemanticSegmentation"
    -        ],
    -        "sha": "b73798972cdf24daafa858994713aca60e2bf90d"
    -    },
    -    "SegformerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "SegformerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SegformerModel",
    -            "TFSegformerModel"
    -        ],
    -        "sha": "3d4ba8ed2bdf801e6afa855b9d77893f2b7f9e10"
    -    },
    -    "Speech2TextForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "Speech2TextTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Speech2TextFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Speech2TextForConditionalGeneration",
    -            "TFSpeech2TextForConditionalGeneration"
    -        ],
    -        "sha": "1da80293ec78762e136cf6dd64b652693f9ab364"
    -    },
    -    "Speech2TextModel": {
    -        "tokenizer_classes": [
    -            "Speech2TextTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Speech2TextFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Speech2TextModel",
    -            "TFSpeech2TextModel"
    -        ],
    -        "sha": "7c6e63bd0c15dd99ef01573d4c43f90e4920cc91"
    -    },
    -    "SpeechEncoderDecoderModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SpeechEncoderDecoderModel"
    -        ],
    -        "sha": "78602ae0857728e95de4042bdca8a31ef818890a"
    -    },
    -    "SpeechT5ForSpeechToText": {
    -        "tokenizer_classes": [
    -            "SpeechT5Tokenizer"
    -        ],
    -        "processor_classes": [
    -            "SpeechT5FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SpeechT5ForSpeechToText"
    -        ],
    -        "sha": "d46f0a83324e5865420a27a738ef203292de3479"
    -    },
    -    "SpeechT5Model": {
    -        "tokenizer_classes": [
    -            "SpeechT5Tokenizer"
    -        ],
    -        "processor_classes": [
    -            "SpeechT5FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "SpeechT5Model"
    -        ],
    -        "sha": "7b248f77ca88ffddcdb538e772f6de63a86a4f9b"
    -    },
    -    "SplinterForPreTraining": {
    -        "tokenizer_classes": [
    -            "SplinterTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SplinterForPreTraining"
    -        ],
    -        "sha": "e8a94efa740f1d685fa553f49132c6f022de5389"
    -    },
    -    "SplinterForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "SplinterTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SplinterForQuestionAnswering"
    -        ],
    -        "sha": "d038b7b683face4a361ab0f474d8a5b111c44c4d"
    -    },
    -    "SplinterModel": {
    -        "tokenizer_classes": [
    -            "SplinterTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SplinterModel"
    -        ],
    -        "sha": "a35b13cbb7faba46dc265761bb839267eb53d248"
    -    },
    -    "SqueezeBertForMaskedLM": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertForMaskedLM"
    -        ],
    -        "sha": "33ce239408c22d2c98be63c9ab4607ef9ceb6d49"
    -    },
    -    "SqueezeBertForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertForMultipleChoice"
    -        ],
    -        "sha": "7e9e666896420c7839e27dcb280981d034ba4da5"
    -    },
    -    "SqueezeBertForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertForQuestionAnswering"
    -        ],
    -        "sha": "bceb045a9ac6eb2ded7d358ed577c6dc28ea487a"
    -    },
    -    "SqueezeBertForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertForSequenceClassification"
    -        ],
    -        "sha": "c5aeb1f454a1d059d41a5f8dacaf784b9de0b899"
    -    },
    -    "SqueezeBertForTokenClassification": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertForTokenClassification"
    -        ],
    -        "sha": "70ba60ca44a380e6aa983a37b163c57217219df7"
    -    },
    -    "SqueezeBertModel": {
    -        "tokenizer_classes": [
    -            "SqueezeBertTokenizer",
    -            "SqueezeBertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SqueezeBertModel"
    -        ],
    -        "sha": "e0a3ac56a4047da3f921638252ead5e44438bbdb"
    -    },
    -    "SwiftFormerForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwiftFormerForImageClassification"
    -        ],
    -        "sha": "a249b14a525d29e675b6e4af4baacd9ba7df7598"
    -    },
    -    "SwiftFormerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwiftFormerModel"
    -        ],
    -        "sha": "25ba2d88c770533f8c69811d2a454a00c1d09f5d"
    -    },
    -    "Swin2SRModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "Swin2SRImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Swin2SRModel"
    -        ],
    -        "sha": "c67f6ecff9ef8675c3869c987277b0a1e040f4be"
    -    },
    -    "SwinBackbone": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwinBackbone"
    -        ],
    -        "sha": "89b28b8ec05a7b3357be75a77eb7809e6fd5cfef"
    -    },
    -    "SwinForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwinForImageClassification",
    -            "TFSwinForImageClassification"
    -        ],
    -        "sha": "e3c2e80f380ef79781313981da1a993dd8b8d34d"
    -    },
    -    "SwinForMaskedImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwinForMaskedImageModeling",
    -            "TFSwinForMaskedImageModeling"
    -        ],
    -        "sha": "d84b061fbace1bc6e697e3253e222de42053f978"
    -    },
    -    "SwinModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "SwinModel",
    -            "TFSwinModel"
    -        ],
    -        "sha": "23ff641295660ec4fea399be8aa1bc14565961f8"
    -    },
    -    "Swinv2ForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Swinv2ForImageClassification"
    -        ],
    -        "sha": "3fd755cdf4cf611db83f72f9c9b00eb9257a38ca"
    -    },
    -    "Swinv2ForMaskedImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Swinv2ForMaskedImageModeling"
    -        ],
    -        "sha": "8375c31eb6231fde36ec6533a34ba5b28e296163"
    -    },
    -    "Swinv2Model": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "Swinv2Model"
    -        ],
    -        "sha": "70aeb72e8a266f668c8b51a517ec01003b8d6804"
    -    },
    -    "SwitchTransformersForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SwitchTransformersForConditionalGeneration"
    -        ],
    -        "sha": "c8fcd2bb735894c78db7f1e5b51afc78aced7adb"
    -    },
    -    "SwitchTransformersModel": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "SwitchTransformersModel"
    -        ],
    -        "sha": "275bbf6d389bfd0540b9f824c609c6b22a577328"
    -    },
    -    "T5EncoderModel": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "T5EncoderModel",
    -            "TFT5EncoderModel"
    -        ],
    -        "sha": "1c75090036a2b3740dfe2d570b889332ad8e59e8"
    -    },
    -    "T5ForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "T5ForConditionalGeneration",
    -            "TFT5ForConditionalGeneration"
    -        ],
    -        "sha": "593fd6072a4e265f5cc73b1973cd8af76b261f29"
    -    },
    -    "T5ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "T5ForQuestionAnswering"
    -        ],
    -        "sha": "b9edf2de494244ff032f67d2d7bdf6c591000c94"
    -    },
    -    "T5ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "T5ForSequenceClassification"
    -        ],
    -        "sha": "105b5c4c8e1efe927444108f1388c4f102ebad15"
    -    },
    -    "T5Model": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "T5Model",
    -            "TFT5Model"
    -        ],
    -        "sha": "eb3d20dda0ba77c1de618d78116a1a0c784c515c"
    -    },
    -    "TableTransformerForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TableTransformerForObjectDetection"
    -        ],
    -        "sha": "9cf1e3f5c3555a727672a32b49f8b96c5aa20be6"
    -    },
    -    "TableTransformerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "DetrImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TableTransformerModel"
    -        ],
    -        "sha": "7b446244d8739b0c29d98f7d537b15ad578577d5"
    -    },
    -    "TapasForMaskedLM": {
    -        "tokenizer_classes": [
    -            "TapasTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTapasForMaskedLM",
    -            "TapasForMaskedLM"
    -        ],
    -        "sha": "2cedb92dd9a3dc37ffb7d35ad5190b110992577c"
    -    },
    -    "TapasForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "TapasTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTapasForQuestionAnswering",
    -            "TapasForQuestionAnswering"
    -        ],
    -        "sha": "4cc91b9e5db662e6e392d8052587ae419896d72b"
    -    },
    -    "TapasForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "TapasTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTapasForSequenceClassification",
    -            "TapasForSequenceClassification"
    -        ],
    -        "sha": "7c37bfb87a6fce2f8604bb3cab2a14e09a285e14"
    -    },
    -    "TapasModel": {
    -        "tokenizer_classes": [
    -            "TapasTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTapasModel",
    -            "TapasModel"
    -        ],
    -        "sha": "bc004af0a415afe1f566c3afe8dd4d48d08c1ce0"
    -    },
    -    "TimesformerForVideoClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TimesformerForVideoClassification"
    -        ],
    -        "sha": "0b3b8e314618d7af34fb44477745491b44bf556d"
    -    },
    -    "TimesformerModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TimesformerModel"
    -        ],
    -        "sha": "ea51f7ebb6426ad2b1fa1396e83f8e8ad5bc3b44"
    -    },
    -    "TransfoXLForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "TransfoXLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTransfoXLForSequenceClassification",
    -            "TransfoXLForSequenceClassification"
    -        ],
    -        "sha": "f3d370184350667d74056b979081b0bf5b0083c1"
    -    },
    -    "TransfoXLLMHeadModel": {
    -        "tokenizer_classes": [
    -            "TransfoXLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTransfoXLLMHeadModel",
    -            "TransfoXLLMHeadModel"
    -        ],
    -        "sha": "e0d4cebcdde52d8d4c81782a1edc606830bd6afd"
    -    },
    -    "TransfoXLModel": {
    -        "tokenizer_classes": [
    -            "TransfoXLTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFTransfoXLModel",
    -            "TransfoXLModel"
    -        ],
    -        "sha": "6938eeae35662a862accb01412dfc486454bdc8f"
    -    },
    -    "TvltForPreTraining": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "TvltProcessor"
    -        ],
    -        "model_classes": [
    -            "TvltForPreTraining"
    -        ],
    -        "sha": "f7bd2833764eb6d55a921aaed81d3f21119016ae"
    -    },
    -    "TvltModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "TvltProcessor"
    -        ],
    -        "model_classes": [
    -            "TvltModel"
    -        ],
    -        "sha": "c3cbf7a6159c038f333ce7adda2480ea3396b2b3"
    -    },
    -    "UMT5EncoderModel": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "UMT5EncoderModel"
    -        ],
    -        "sha": "2894e49c9fbd17ea4b3dab56ec388be354c1a5f0"
    -    },
    -    "UMT5ForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "UMT5ForQuestionAnswering"
    -        ],
    -        "sha": "b381aa068a44200db539f2f48f4e34a5ed1cb093"
    -    },
    -    "UMT5ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "UMT5ForSequenceClassification"
    -        ],
    -        "sha": "aa9f77b7b3cff21425b7512e7c0f478af7b5db14"
    -    },
    -    "UMT5Model": {
    -        "tokenizer_classes": [
    -            "T5Tokenizer",
    -            "T5TokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "UMT5Model"
    -        ],
    -        "sha": "9180d850b24e5494442a4f7a8ca1a4c102f9babd"
    -    },
    -    "UniSpeechForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechForCTC"
    -        ],
    -        "sha": "102b56d76f4d74cface309801c0ad80892583751"
    -    },
    -    "UniSpeechForPreTraining": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechForPreTraining"
    -        ],
    -        "sha": "830be5b3e85aaae7bcc961218e417c29743d6042"
    -    },
    -    "UniSpeechForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechForSequenceClassification"
    -        ],
    -        "sha": "a30ac1516944757ccd8efcbcf94033a03f8708bf"
    -    },
    -    "UniSpeechModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechModel"
    -        ],
    -        "sha": "18e170eb1091715b74ace28c8c380b6bf2b6202d"
    -    },
    -    "UniSpeechSatForAudioFrameClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatForAudioFrameClassification"
    -        ],
    -        "sha": "7eba5a1c6cd610928b27ecb217bb17c729a07a57"
    -    },
    -    "UniSpeechSatForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatForCTC"
    -        ],
    -        "sha": "a8617538d3a2ae990f022bb0c36b8428a4870822"
    -    },
    -    "UniSpeechSatForPreTraining": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatForPreTraining"
    -        ],
    -        "sha": "a772f66db0ab49e1050e524d7fcbe5106ebdaf96"
    -    },
    -    "UniSpeechSatForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatForSequenceClassification"
    -        ],
    -        "sha": "f1c16567bd829a6d8a7a2d167d22e9653149e625"
    -    },
    -    "UniSpeechSatForXVector": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatForXVector"
    -        ],
    -        "sha": "71cb3780cf3678f74fba00e19df82df76dca6133"
    -    },
    -    "UniSpeechSatModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "UniSpeechSatModel"
    -        ],
    -        "sha": "ea755bbc7c6c6aa649c58b4b000f243acbbd6b5a"
    -    },
    -    "UperNetForSemanticSegmentation": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "SegformerImageProcessor"
    -        ],
    -        "model_classes": [
    -            "UperNetForSemanticSegmentation"
    -        ],
    -        "sha": "f1871cb388bc0b203f5397bfc06a373736c2fb9c"
    -    },
    -    "VanForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "VanForImageClassification"
    -        ],
    -        "sha": "694eb147bc4768aeabeffbfb97732281b71a621d"
    -    },
    -    "VanModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ConvNextImageProcessor"
    -        ],
    -        "model_classes": [
    -            "VanModel"
    -        ],
    -        "sha": "d8ac60ce952020f2b0355fc566d634b2c5ba635d"
    -    },
    -    "ViTForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFViTForImageClassification",
    -            "ViTForImageClassification"
    -        ],
    -        "sha": "5b3b44a3ed492070c273e481e30ecf4deddc5ec3"
    -    },
    -    "ViTForMaskedImageModeling": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViTForMaskedImageModeling"
    -        ],
    -        "sha": "d984e0b432fe195c2c26952d4f249031e7b1e2ea"
    -    },
    -    "ViTHybridForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTHybridImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViTHybridForImageClassification"
    -        ],
    -        "sha": "69c7c396032ffe60d54953b584394899fb95ccc1"
    -    },
    -    "ViTHybridModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTHybridImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViTHybridModel"
    -        ],
    -        "sha": "077443bfefe40d625314dbd274d2ff8089624797"
    -    },
    -    "ViTMAEForPreTraining": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFViTMAEForPreTraining",
    -            "ViTMAEForPreTraining"
    -        ],
    -        "sha": "2d98d80d9c45eef0d5b6f5426d7196bb546fe9fc"
    -    },
    -    "ViTMAEModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFViTMAEModel",
    -            "ViTMAEModel"
    -        ],
    -        "sha": "c7c2f12c19d2dbec08851a9dac7485909629a5fd"
    -    },
    -    "ViTMSNForImageClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViTMSNForImageClassification"
    -        ],
    -        "sha": "feda819aa7dbb55d850130f4cf1d210858d7eb89"
    -    },
    -    "ViTMSNModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViTMSNModel"
    -        ],
    -        "sha": "0733abf168cb47a149821fdd2113d546e15c47de"
    -    },
    -    "ViTModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFViTModel",
    -            "ViTModel"
    -        ],
    -        "sha": "31817b7a64ebc3333fcd4801dfbb356ab07b13dd"
    -    },
    -    "VideoMAEForPreTraining": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "VideoMAEForPreTraining"
    -        ],
    -        "sha": "9de66c4bb759dc7269a7af17bf70b3194550acaa"
    -    },
    -    "VideoMAEForVideoClassification": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "VideoMAEForVideoClassification"
    -        ],
    -        "sha": "d3f743408386bc0ffe2d979de35335e87bc34aec"
    -    },
    -    "VideoMAEModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "VideoMAEModel"
    -        ],
    -        "sha": "a2be96beba888817d92b67525601569d830342ff"
    -    },
    -    "ViltForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ViltImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViltForQuestionAnswering"
    -        ],
    -        "sha": "faeffbf43da6621717d8b13e7ebe87d58d750cb2"
    -    },
    -    "ViltModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ViltImageProcessor"
    -        ],
    -        "model_classes": [
    -            "ViltModel"
    -        ],
    -        "sha": "3a89b7b5782947c4f4125162ffe1c9cc18c9c800"
    -    },
    -    "VisionEncoderDecoderModel": {
    -        "tokenizer_classes": [
    -            "GPT2Tokenizer",
    -            "GPT2TokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFVisionEncoderDecoderModel",
    -            "VisionEncoderDecoderModel"
    -        ],
    -        "sha": "23917761070cf16b26a6d033b6bff9100bbc618b"
    -    },
    -    "VisionTextDualEncoderModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "ViTImageProcessor"
    -        ],
    -        "model_classes": [
    -            "TFVisionTextDualEncoderModel",
    -            "VisionTextDualEncoderModel"
    -        ],
    -        "sha": "c3569ef17f66acbacb76f7ceb6f71e02d075dd6c"
    -    },
    -    "VisualBertForPreTraining": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "VisualBertForPreTraining"
    -        ],
    -        "sha": "ce5a4d93ce762971cd216cda9aef8b9ce3f0450b"
    -    },
    -    "VisualBertModel": {
    -        "tokenizer_classes": [
    -            "BertTokenizer",
    -            "BertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "VisualBertModel"
    -        ],
    -        "sha": "85020189fb7bf1217eb9370b09bca8ec5bcfdafa"
    -    },
    -    "Wav2Vec2ConformerForAudioFrameClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerForAudioFrameClassification"
    -        ],
    -        "sha": "e316a18a1d165b4cb51a7f28f8e8dab676da4b56"
    -    },
    -    "Wav2Vec2ConformerForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerForCTC"
    -        ],
    -        "sha": "a2ecb2985fcbb9f3ed000c12c1af6da36f5eaa3a"
    -    },
    -    "Wav2Vec2ConformerForPreTraining": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerForPreTraining"
    -        ],
    -        "sha": "099279b69e5da19efb05589804ccee210a0e57ae"
    -    },
    -    "Wav2Vec2ConformerForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerForSequenceClassification"
    -        ],
    -        "sha": "e8c1bca543c54bf15a6c026cb3761993b52cf617"
    -    },
    -    "Wav2Vec2ConformerForXVector": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerForXVector"
    -        ],
    -        "sha": "ba206a55998f16e134960728bd02006eaf39114f"
    -    },
    -    "Wav2Vec2ConformerModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ConformerModel"
    -        ],
    -        "sha": "ef2fe3aa8c23e6f8696e6612061aaddecae49994"
    -    },
    -    "Wav2Vec2ForAudioFrameClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ForAudioFrameClassification"
    -        ],
    -        "sha": "ab219f119e10f56e1059966c66d23f0df3c2c343"
    -    },
    -    "Wav2Vec2ForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ForCTC"
    -        ],
    -        "sha": "6245fbb1cb99cea5c4de1e73f81fba978fb275ac"
    -    },
    -    "Wav2Vec2ForMaskedLM": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ForMaskedLM"
    -        ],
    -        "sha": "e083cf4fefec4df3c241dbbe5e17a84a794a89bd"
    -    },
    -    "Wav2Vec2ForPreTraining": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ForPreTraining"
    -        ],
    -        "sha": "a8d71e216334260353ccbf5ce84cd6924f7457da"
    -    },
    -    "Wav2Vec2ForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "TFWav2Vec2ForSequenceClassification",
    -            "Wav2Vec2ForSequenceClassification"
    -        ],
    -        "sha": "2000b2022abcc37100241485f5872126b70164c9"
    -    },
    -    "Wav2Vec2ForXVector": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "Wav2Vec2ForXVector"
    -        ],
    -        "sha": "f4c422db53aae061ea609f4407af7cd5b33c8942"
    -    },
    -    "Wav2Vec2Model": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "TFWav2Vec2Model",
    -            "Wav2Vec2Model"
    -        ],
    -        "sha": "7a998ee3ee0619a52828a79c3eed6872fd053f37"
    -    },
    -    "WavLMForAudioFrameClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WavLMForAudioFrameClassification"
    -        ],
    -        "sha": "b135610f8d5de0b1a5bf5ed7212966135c63d6ec"
    -    },
    -    "WavLMForCTC": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WavLMForCTC"
    -        ],
    -        "sha": "f1139c5ddf34d2327ae1f6917edd7da180b06971"
    -    },
    -    "WavLMForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WavLMForSequenceClassification"
    -        ],
    -        "sha": "4ba5f2019b46866ce2011c993194ebda60afc028"
    -    },
    -    "WavLMForXVector": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WavLMForXVector"
    -        ],
    -        "sha": "faf9264eac56a56d5510a0984d7e1146e4c8cf62"
    -    },
    -    "WavLMModel": {
    -        "tokenizer_classes": [
    -            "Wav2Vec2CTCTokenizer"
    -        ],
    -        "processor_classes": [
    -            "Wav2Vec2FeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WavLMModel"
    -        ],
    -        "sha": "e932275e37cb643be271f655bd1d649f4f4b4bd5"
    -    },
    -    "WhisperForAudioClassification": {
    -        "tokenizer_classes": [
    -            "WhisperTokenizer"
    -        ],
    -        "processor_classes": [
    -            "WhisperFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "WhisperForAudioClassification"
    -        ],
    -        "sha": "d71b13674b1a67443cd19d0594a3b5b1e5968f0d"
    -    },
    -    "WhisperForConditionalGeneration": {
    -        "tokenizer_classes": [
    -            "WhisperTokenizer",
    -            "WhisperTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "WhisperFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "TFWhisperForConditionalGeneration",
    -            "WhisperForConditionalGeneration"
    -        ],
    -        "sha": "598101b885b24508042d9292e54aa04bff96318e"
    -    },
    -    "WhisperModel": {
    -        "tokenizer_classes": [
    -            "WhisperTokenizer",
    -            "WhisperTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "WhisperFeatureExtractor"
    -        ],
    -        "model_classes": [
    -            "TFWhisperModel",
    -            "WhisperModel"
    -        ],
    -        "sha": "c04c50216bb6b0a8f4d55f2fa9f9f4cf61c8a77c"
    -    },
    -    "XCLIPModel": {
    -        "tokenizer_classes": [
    -            "CLIPTokenizer",
    -            "CLIPTokenizerFast"
    -        ],
    -        "processor_classes": [
    -            "VideoMAEImageProcessor"
    -        ],
    -        "model_classes": [
    -            "XCLIPModel"
    -        ],
    -        "sha": "299ffffc6b94c3558bf7dbc38e24074c99490046"
    -    },
    -    "XGLMForCausalLM": {
    -        "tokenizer_classes": [
    -            "XGLMTokenizer",
    -            "XGLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXGLMForCausalLM",
    -            "XGLMForCausalLM"
    -        ],
    -        "sha": "d5381ce297c249d559937c6bb6316cf1fdad2613"
    -    },
    -    "XGLMModel": {
    -        "tokenizer_classes": [
    -            "XGLMTokenizer",
    -            "XGLMTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXGLMModel",
    -            "XGLMModel"
    -        ],
    -        "sha": "2b5cef167822cfaa558d259af1722e2f785cd3d5"
    -    },
    -    "XLMForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMForMultipleChoice",
    -            "XLMForMultipleChoice"
    -        ],
    -        "sha": "f0c8cc6462449ac9eb9b4158e433bd3c923db3af"
    -    },
    -    "XLMForQuestionAnsweringSimple": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMForQuestionAnsweringSimple",
    -            "XLMForQuestionAnsweringSimple"
    -        ],
    -        "sha": "82e93a2653cf3646eaaf02d8cc5f8ff9a4551523"
    -    },
    -    "XLMForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMForSequenceClassification",
    -            "XLMForSequenceClassification"
    -        ],
    -        "sha": "2d6892f5f703be9b481bca91477032bd0e36dbe5"
    -    },
    -    "XLMForTokenClassification": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMForTokenClassification",
    -            "XLMForTokenClassification"
    -        ],
    -        "sha": "9a591395e7a0643a03f5d2debb98caa3966e021c"
    -    },
    -    "XLMModel": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMModel",
    -            "XLMModel"
    -        ],
    -        "sha": "022b86df246414ff712475d9ca55db690ff1d3bf"
    -    },
    -    "XLMRobertaXLForCausalLM": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForCausalLM"
    -        ],
    -        "sha": "fc05408e5b33a31638476ef337719dfbb7615ef3"
    -    },
    -    "XLMRobertaXLForMaskedLM": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForMaskedLM"
    -        ],
    -        "sha": "e96f198eede757e5ae2c87632fdcfb341073ef6e"
    -    },
    -    "XLMRobertaXLForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForMultipleChoice"
    -        ],
    -        "sha": "52732625f1bfbbb7cb4ba1cf0963de596d81822d"
    -    },
    -    "XLMRobertaXLForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForQuestionAnswering"
    -        ],
    -        "sha": "da388fdd2d28e0757eb0c2b2c612a8ff03af2223"
    -    },
    -    "XLMRobertaXLForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForSequenceClassification"
    -        ],
    -        "sha": "980721187633bcf21ac0b8edbed933527f4611df"
    -    },
    -    "XLMRobertaXLForTokenClassification": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLForTokenClassification"
    -        ],
    -        "sha": "37a97280faf6fef0bd946d3934d77a1b60fbf473"
    -    },
    -    "XLMRobertaXLModel": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XLMRobertaXLModel"
    -        ],
    -        "sha": "8fbeb39a984912e47f5d24a31be61639031a0fc3"
    -    },
    -    "XLMWithLMHeadModel": {
    -        "tokenizer_classes": [
    -            "XLMTokenizer"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLMWithLMHeadModel",
    -            "XLMWithLMHeadModel"
    -        ],
    -        "sha": "db70bdefbaf095e88b8097e4b601d9105a511afa"
    -    },
    -    "XLNetForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetForMultipleChoice",
    -            "XLNetForMultipleChoice"
    -        ],
    -        "sha": "8bb7e28d0cd1e93154d3232baf5e9c79acaf9f1a"
    -    },
    -    "XLNetForQuestionAnsweringSimple": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetForQuestionAnsweringSimple",
    -            "XLNetForQuestionAnsweringSimple"
    -        ],
    -        "sha": "fabd06a45d947f3d46f1b8dce2186cf3b27776dc"
    -    },
    -    "XLNetForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetForSequenceClassification",
    -            "XLNetForSequenceClassification"
    -        ],
    -        "sha": "e3c194f24537ebf2c474ade60becb9397696edec"
    -    },
    -    "XLNetForTokenClassification": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetForTokenClassification",
    -            "XLNetForTokenClassification"
    -        ],
    -        "sha": "16aa15029aa667046d504c4a88ceddfdd5b5fb40"
    -    },
    -    "XLNetLMHeadModel": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetLMHeadModel",
    -            "XLNetLMHeadModel"
    -        ],
    -        "sha": "c9a98cc982a16ca162832a8cbea25116479bb938"
    -    },
    -    "XLNetModel": {
    -        "tokenizer_classes": [
    -            "XLNetTokenizer",
    -            "XLNetTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "TFXLNetModel",
    -            "XLNetModel"
    -        ],
    -        "sha": "1d6e231942135faf32b8d9a97773d8f6c85ca561"
    -    },
    -    "XmodForCausalLM": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForCausalLM"
    -        ],
    -        "sha": "c6b746071f2f067099a8fb4f57ce3c27a7e4b67d"
    -    },
    -    "XmodForMaskedLM": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForMaskedLM"
    -        ],
    -        "sha": "e1085818f4ed3c6073b2038635e5f3061208923d"
    -    },
    -    "XmodForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForMultipleChoice"
    -        ],
    -        "sha": "c63042cdf196be3fed846421b345d439b2483f69"
    -    },
    -    "XmodForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForQuestionAnswering"
    -        ],
    -        "sha": "75acd3071fae9978c82618cd0f090c87aabc1f23"
    -    },
    -    "XmodForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForSequenceClassification"
    -        ],
    -        "sha": "523a16570be048618913ac17ccd00d343bcb5e99"
    -    },
    -    "XmodForTokenClassification": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodForTokenClassification"
    -        ],
    -        "sha": "a0f0a02732b4579670dad11a69ae244ebd777b49"
    -    },
    -    "XmodModel": {
    -        "tokenizer_classes": [
    -            "XLMRobertaTokenizer",
    -            "XLMRobertaTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "XmodModel"
    -        ],
    -        "sha": "bc286de0035450e7dcd6bcce78098a967b9c2b6c"
    -    },
    -    "YolosForObjectDetection": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "YolosImageProcessor"
    -        ],
    -        "model_classes": [
    -            "YolosForObjectDetection"
    -        ],
    -        "sha": "0a4aae25bfbe8b5edd4815cb00d697a6ba7d2126"
    -    },
    -    "YolosModel": {
    -        "tokenizer_classes": [],
    -        "processor_classes": [
    -            "YolosImageProcessor"
    -        ],
    -        "model_classes": [
    -            "YolosModel"
    -        ],
    -        "sha": "339bc51f1914f031a550e5f95095ed4a4c22a7de"
    -    },
    -    "YosoForMaskedLM": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoForMaskedLM"
    -        ],
    -        "sha": "cb291bedcbec199ea195f086e3ebea6fab026bba"
    -    },
    -    "YosoForMultipleChoice": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoForMultipleChoice"
    -        ],
    -        "sha": "cf2d3a3f0628bc9d0da68ea8de26b12016453fee"
    -    },
    -    "YosoForQuestionAnswering": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoForQuestionAnswering"
    -        ],
    -        "sha": "e8c3091f674588adfa3371b3de0427a9b39dd03f"
    -    },
    -    "YosoForSequenceClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoForSequenceClassification"
    -        ],
    -        "sha": "88132cbaa1a9a87f65b6f9813c388011377f18cf"
    -    },
    -    "YosoForTokenClassification": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoForTokenClassification"
    -        ],
    -        "sha": "fd2219856608d3dba70dc7b1a06af629903dec31"
    -    },
    -    "YosoModel": {
    -        "tokenizer_classes": [
    -            "AlbertTokenizerFast"
    -        ],
    -        "processor_classes": [],
    -        "model_classes": [
    -            "YosoModel"
    -        ],
    -        "sha": "e144d9f1fe39c21eda1177702640e126892605ce"
    -    }
    -}
    \ No newline at end of file
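
The mapping deleted above associates each Transformers model class with the tokenizer and processor classes used alongside it, plus a "sha" field per entry. As a minimal sketch of how such a mapping can be consumed (the file path below is hypothetical; the real location appears in the diff header for this file), it can be loaded with the standard json module and queried by class name:

import json

# Hypothetical path: substitute the actual location of the deleted JSON file.
with open("tiny_model_summary.json") as f:
    summary = json.load(f)

# Look up an entry that appears in the mapping above.
entry = summary["T5ForConditionalGeneration"]
print(entry["tokenizer_classes"])  # ['T5Tokenizer', 'T5TokenizerFast']
print(entry["processor_classes"])  # []
print(entry["model_classes"])      # ['T5ForConditionalGeneration', 'TFT5ForConditionalGeneration']
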
    diff --git a/tests/transformers/utils/test_module/__init__.py b/tests/transformers/utils/test_module/__init__.py
    deleted file mode 100644
    index e69de29bb2..0000000000
    diff --git a/tests/transformers/utils/test_module/custom_configuration.py b/tests/transformers/utils/test_module/custom_configuration.py
    deleted file mode 100644
    index 676486fc51..0000000000
    --- a/tests/transformers/utils/test_module/custom_configuration.py
    +++ /dev/null
    @@ -1,16 +0,0 @@
    -from transformers import PretrainedConfig
    -
    -
    -class CustomConfig(PretrainedConfig):
    -    model_type = "custom"
    -
    -    def __init__(self, attribute=1, **kwargs):
    -        self.attribute = attribute
    -        super().__init__(**kwargs)
    -
    -
    -class NoSuperInitConfig(PretrainedConfig):
    -    model_type = "custom"
    -
    -    def __init__(self, attribute=1, **kwargs):
    -        self.attribute = attribute
    diff --git a/tests/transformers/utils/test_module/custom_feature_extraction.py b/tests/transformers/utils/test_module/custom_feature_extraction.py
    deleted file mode 100644
    index de367032d8..0000000000
    --- a/tests/transformers/utils/test_module/custom_feature_extraction.py
    +++ /dev/null
    @@ -1,5 +0,0 @@
    -from transformers import Wav2Vec2FeatureExtractor
    -
    -
    -class CustomFeatureExtractor(Wav2Vec2FeatureExtractor):
    -    pass
    diff --git a/tests/transformers/utils/test_module/custom_image_processing.py b/tests/transformers/utils/test_module/custom_image_processing.py
    deleted file mode 100644
    index e4984854ad..0000000000
    --- a/tests/transformers/utils/test_module/custom_image_processing.py
    +++ /dev/null
    @@ -1,5 +0,0 @@
    -from transformers import CLIPImageProcessor
    -
    -
    -class CustomImageProcessor(CLIPImageProcessor):
    -    pass
    diff --git a/tests/transformers/utils/test_module/custom_modeling.py b/tests/transformers/utils/test_module/custom_modeling.py
    deleted file mode 100644
    index 4c9e13d467..0000000000
    --- a/tests/transformers/utils/test_module/custom_modeling.py
    +++ /dev/null
    @@ -1,32 +0,0 @@
    -import torch
    -from transformers import PreTrainedModel
    -
    -from .custom_configuration import CustomConfig, NoSuperInitConfig
    -
    -
    -class CustomModel(PreTrainedModel):
    -    config_class = CustomConfig
    -
    -    def __init__(self, config):
    -        super().__init__(config)
    -        self.linear = torch.nn.Linear(config.hidden_size, config.hidden_size)
    -
    -    def forward(self, x):
    -        return self.linear(x)
    -
    -    def _init_weights(self, module):
    -        pass
    -
    -
    -class NoSuperInitModel(PreTrainedModel):
    -    config_class = NoSuperInitConfig
    -
    -    def __init__(self, config):
    -        super().__init__(config)
    -        self.linear = torch.nn.Linear(config.attribute, config.attribute)
    -
    -    def forward(self, x):
    -        return self.linear(x)
    -
    -    def _init_weights(self, module):
    -        pass
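For readers unfamiliar with this pattern, a custom config/model pair like the one above is typically hooked into the Auto classes before use. The sketch below is an assumption about usage, not part of the deleted file; it re-declares both classes so it runs standalone, and the `hidden_size` value is arbitrary.

```python
import torch
from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel


class CustomConfig(PretrainedConfig):
    model_type = "custom"

    def __init__(self, attribute=1, **kwargs):
        self.attribute = attribute
        super().__init__(**kwargs)


class CustomModel(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.linear = torch.nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, x):
        return self.linear(x)

    def _init_weights(self, module):
        pass


# Register the pair so the Auto classes can resolve model_type "custom".
AutoConfig.register("custom", CustomConfig)
AutoModel.register(CustomConfig, CustomModel)

config = CustomConfig(hidden_size=32)   # extra kwargs become config attributes
model = AutoModel.from_config(config)
print(model(torch.randn(1, 32)).shape)  # torch.Size([1, 32])
```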
    diff --git a/tests/transformers/utils/test_module/custom_pipeline.py b/tests/transformers/utils/test_module/custom_pipeline.py
    deleted file mode 100644
    index 4172bd0c66..0000000000
    --- a/tests/transformers/utils/test_module/custom_pipeline.py
    +++ /dev/null
    @@ -1,32 +0,0 @@
    -import numpy as np
    -from transformers import Pipeline
    -
    -
    -def softmax(outputs):
    -    maxes = np.max(outputs, axis=-1, keepdims=True)
    -    shifted_exp = np.exp(outputs - maxes)
    -    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
    -
    -
    -class PairClassificationPipeline(Pipeline):
    -    def _sanitize_parameters(self, **kwargs):
    -        preprocess_kwargs = {}
    -        if "second_text" in kwargs:
    -            preprocess_kwargs["second_text"] = kwargs["second_text"]
    -        return preprocess_kwargs, {}, {}
    -
    -    def preprocess(self, text, second_text=None):
    -        return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)
    -
    -    def _forward(self, model_inputs):
    -        return self.model(**model_inputs)
    -
    -    def postprocess(self, model_outputs):
    -        logits = model_outputs.logits[0].numpy()
    -        probabilities = softmax(logits)
    -
    -        best_class = np.argmax(probabilities)
    -        label = self.model.config.id2label[best_class]
    -        score = probabilities[best_class].item()
    -        logits = logits.tolist()
    -        return {"label": label, "score": score, "logits": logits}
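A pipeline subclass like the one above only becomes reachable through `pipeline(...)` once its task name is registered. The sketch below follows the usual registration pattern; the import path and the checkpoint name are placeholders, not something defined in this repository.

```python
from transformers import AutoModelForSequenceClassification, pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Placeholder import: adjust to wherever the class above lives on your path.
from custom_pipeline import PairClassificationPipeline

PIPELINE_REGISTRY.register_pipeline(
    "pair-classification",
    pipeline_class=PairClassificationPipeline,
    pt_model=AutoModelForSequenceClassification,
)

# Any sequence-classification checkpoint works here; this one is illustrative.
classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
print(classifier("I like you", second_text="I love you"))
```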
    diff --git a/tests/transformers/utils/test_module/custom_processing.py b/tests/transformers/utils/test_module/custom_processing.py
    deleted file mode 100644
    index 196fc511b6..0000000000
    --- a/tests/transformers/utils/test_module/custom_processing.py
    +++ /dev/null
    @@ -1,6 +0,0 @@
    -from transformers import ProcessorMixin
    -
    -
    -class CustomProcessor(ProcessorMixin):
    -    feature_extractor_class = "AutoFeatureExtractor"
    -    tokenizer_class = "AutoTokenizer"
    diff --git a/tests/transformers/utils/test_module/custom_tokenization.py b/tests/transformers/utils/test_module/custom_tokenization.py
    deleted file mode 100644
    index d67b137304..0000000000
    --- a/tests/transformers/utils/test_module/custom_tokenization.py
    +++ /dev/null
    @@ -1,5 +0,0 @@
    -from transformers import BertTokenizer
    -
    -
    -class CustomTokenizer(BertTokenizer):
    -    pass
    diff --git a/tests/transformers/utils/test_module/custom_tokenization_fast.py b/tests/transformers/utils/test_module/custom_tokenization_fast.py
    deleted file mode 100644
    index ace94fdd1a..0000000000
    --- a/tests/transformers/utils/test_module/custom_tokenization_fast.py
    +++ /dev/null
    @@ -1,8 +0,0 @@
    -from transformers import BertTokenizerFast
    -
    -from .custom_tokenization import CustomTokenizer
    -
    -
    -class CustomTokenizerFast(BertTokenizerFast):
    -    slow_tokenizer_class = CustomTokenizer
    -    pass
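For context, a slow/fast tokenizer pair like this is usually wired into `AutoTokenizer` against the custom config class. The sketch below assumes the modules shown earlier in this diff are importable under these names, which is an assumption about your path layout.

```python
from transformers import AutoTokenizer

# Placeholder imports: adjust to the actual location of the test modules.
from custom_configuration import CustomConfig
from custom_tokenization import CustomTokenizer
from custom_tokenization_fast import CustomTokenizerFast

AutoTokenizer.register(
    CustomConfig,
    slow_tokenizer_class=CustomTokenizer,
    fast_tokenizer_class=CustomTokenizerFast,
)
```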
    diff --git a/tests/utils.py b/tests/utils.py
    deleted file mode 100644
    index cce23476e5..0000000000
    --- a/tests/utils.py
    +++ /dev/null
    @@ -1,99 +0,0 @@
    -# coding=utf-8
    -# Copyright 2022 HuggingFace Inc.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -
    -
    -# Mapping between model families and specific model names with their configuration
    -MODELS_TO_TEST_MAPPING = {
    -    "audio-spectrogram-transformer": [
    -        ("MIT/ast-finetuned-speech-commands-v2", "Habana/wav2vec2"),
    -    ],
    -    "bert": [
    -        # ("bert-base-uncased", "Habana/bert-base-uncased"),
    -        ("bert-large-uncased-whole-word-masking", "Habana/bert-large-uncased-whole-word-masking"),
    -    ],
    -    "roberta": [
    -        ("roberta-base", "Habana/roberta-base"),
    -        ("roberta-large", "Habana/roberta-large"),
    -    ],
    -    "albert": [
    -        ("albert-large-v2", "Habana/albert-large-v2"),
    -        ("albert-xxlarge-v1", "Habana/albert-xxlarge-v1"),
    -    ],
    -    "distilbert": [
    -        ("distilbert-base-uncased", "Habana/distilbert-base-uncased"),
    -    ],
    -    "gpt2": [
    -        ("gpt2", "Habana/gpt2"),
    -        ("gpt2-xl", "Habana/gpt2"),
    -    ],
    -    "t5": [
    -        ("t5-small", "Habana/t5"),
    -        ("google/flan-t5-xxl", "Habana/t5"),
    -    ],
    -    "vit": [
    -        ("google/vit-base-patch16-224-in21k", "Habana/vit"),
    -    ],
    -    "wav2vec2": [
    -        ("facebook/wav2vec2-base", "Habana/wav2vec2"),
    -        ("facebook/wav2vec2-large-lv60", "Habana/wav2vec2"),
    -    ],
    -    "swin": [("microsoft/swin-base-patch4-window7-224-in22k", "Habana/swin")],
    -    "clip": [("./clip-roberta", "Habana/clip")],
    -    "bridgetower": [("BridgeTower/bridgetower-large-itm-mlm-itc", "Habana/clip")],
    -    "gpt_neox": [("EleutherAI/gpt-neox-20b", "Habana/gpt2")],
    -    "llama": [("huggyllama/llama-7b", "Habana/llama")],
    -    "falcon": [("tiiuae/falcon-40b", "Habana/falcon")],
    -    "bloom": [("bigscience/bloom-7b1", "Habana/roberta-base")],
    -    "whisper": [("openai/whisper-small", "Habana/whisper")],
    -    "llama_guard": [("meta-llama/LlamaGuard-7b", "Habana/llama")],
    -    "code_llama": [("codellama/CodeLlama-13b-Instruct-hf", "Habana/llama")],
    -    "protst": [("mila-intel/protst-esm1b-for-sequential-classification", "Habana/gpt2")],
    -}
    -
    -MODELS_TO_TEST_FOR_QUESTION_ANSWERING = [
    -    "bert",
    -    "roberta",
    -    "albert",
    -    "distilbert",
    -]
    -
-# Only BERT and Llama Guard are currently validated for sequence classification
    -MODELS_TO_TEST_FOR_SEQUENCE_CLASSIFICATION = [
    -    "bert",
    -    "llama_guard",
    -    # "roberta",
    -    # "albert",
    -    # "distilbert",
    -]
    -
    -MODELS_TO_TEST_FOR_CAUSAL_LANGUAGE_MODELING = ["gpt2", "gpt_neox", "bloom", "code_llama"]
    -
    -MODELS_TO_TEST_FOR_SEQ2SEQ = ["t5"]
    -
    -MODELS_TO_TEST_FOR_IMAGE_CLASSIFICATION = ["vit", "swin"]
    -
    -# Only RoBERTa is tested in CI for MLM
    -MODELS_TO_TEST_FOR_MASKED_LANGUAGE_MODELING = [
    -    # "bert",
    -    "roberta",
    -    # "albert",
    -    # "distilbert",
    -]
    -
    -MODELS_TO_TEST_FOR_AUDIO_CLASSIFICATION = ["wav2vec2", "audio-spectrogram-transformer"]
    -
    -MODELS_TO_TEST_FOR_SPEECH_RECOGNITION = ["wav2vec2", "whisper"]
    -
    -MODELS_TO_TEST_FOR_IMAGE_TEXT = ["clip"]
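As a rough illustration of how these constants were consumed, each mapping entry pairs a Hub checkpoint with the Gaudi configuration it runs with, and the per-task lists select which model families a given test suite parametrizes over. The sketch below inlines a tiny excerpt so it runs standalone; it is not taken from the deleted file.

```python
import pytest

# Inlined excerpt of the mapping above so this sketch is self-contained.
MODELS_TO_TEST_MAPPING = {
    "bert": [("bert-large-uncased-whole-word-masking", "Habana/bert-large-uncased-whole-word-masking")],
    "roberta": [("roberta-base", "Habana/roberta-base")],
}
MODELS_TO_TEST_FOR_QUESTION_ANSWERING = ["bert", "roberta"]

QA_CASES = [
    case
    for family in MODELS_TO_TEST_FOR_QUESTION_ANSWERING
    for case in MODELS_TO_TEST_MAPPING[family]
]


@pytest.mark.parametrize("model_name, gaudi_config_name", QA_CASES)
def test_question_answering_mapping(model_name, gaudi_config_name):
    # A real test would launch the question-answering example with these names;
    # here we only check that each entry is a (model, Gaudi config) pair.
    assert model_name and gaudi_config_name.startswith("Habana/")
```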
    diff --git a/text-generation-inference/README.md b/text-generation-inference/README.md
    deleted file mode 100644
    index b7803fb3d6..0000000000
    --- a/text-generation-inference/README.md
    +++ /dev/null
    @@ -1,19 +0,0 @@
    -
    -
    -# Text Generation Inference on Intel® Gaudi® AI Accelerators
    -
-Please refer to the following fork of TGI to deploy it on Intel Gaudi accelerators: https://github.com/huggingface/tgi-gaudi