# .github/workflows/review.yml

# This workflow builds the markup review report for the pull request HEAD and for its base commit, publishes both as artifacts and then publishes their diff.
# For more information on using Python in workflows see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

# Display name of the workflow.
name: Diff review for markup

# Triggered by pull requests that target the main branch.
on:
  pull_request:
    branches:
      - main

jobs:

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Builds the review report for the pull request HEAD commit, checks the
  # benchmark scores of an empty report against .ci/benchmark.txt and uploads
  # the results as the `review_head` artifact.
  download_repos:

    runs-on: ubuntu-latest

    steps:

      - name: Checkout CredData
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      # The checksum list covers snapshot.yaml, every file under meta/ and the
      # top-level *.py scripts. Its hash is the key of the review cache below.
      - name: Markup hashing
        run: |
          md5sum snapshot.yaml >head_checksums.md5
          # NUL-delimited pipelines keep paths with whitespace intact and run
          # md5sum once per batch instead of once per file
          find meta -type f -print0 | sort -z | xargs -0 -r md5sum >>head_checksums.md5
          find . -maxdepth 1 -type f -name "*.py" -print0 | sort -z | xargs -0 -r md5sum >>head_checksums.md5
          cat head_checksums.md5
          sha256sum head_checksums.md5

      # On a cache hit the report files are restored and every step guarded by
      # `cache-hit != 'true'` below is skipped.
      - name: Cache head review
        id: cache-data
        uses: actions/cache@v4
        with:
          path: |
            review_head.txt
            review_head.html
          key: cred-data-${{ hashFiles('head_checksums.md5') }}

      # NOTE(review): tmp is presumably the download cache of download_data.py;
      # it is keyed by snapshot.yaml only - confirm.
      - name: Cache tmp
        if: steps.cache-data.outputs.cache-hit != 'true'
        id: cache-tmp
        uses: actions/cache@v4
        with:
          path: tmp
          key: cred-data-${{ hashFiles('snapshot.yaml') }}

      # The colorized-logs package provides the ansi2html command used below.
      # apt-get with --yes never waits for confirmation on a runner.
      - name: install ansi2html
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: sudo apt-get update && sudo apt-get install --yes colorized-logs

      # v5 is the Node 20 based major of setup-python (v4 runs on the
      # deprecated Node 16 runtime).
      - name: Set up Python 3.10
        if: steps.cache-data.outputs.cache-hit != 'true'
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Produce review report from HEAD
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: |
          python -m pip install --upgrade pip
          python -m pip install --requirement requirements.txt
          # skip extra hints from git
          git config --global init.defaultBranch work
          python download_data.py --data_dir data --jobs $(nproc)
          # stdout and stderr both go into the text report
          python review_data.py &>review_head.txt
          ansi2html --style 'pre {font-family: monospace; font-size: large}' <review_head.txt  >review_head.html

      # diff exits non-zero, and so fails the step, when the scores differ
      # from the reference file (whitespace and blank lines are ignored).
      - name: Produce benchmark scores from empty report to check markup only
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: |
          python -m benchmark --scanner credsweeper --load .ci/empty_report.json >benchmark.txt
          diff --unified=3 --ignore-all-space --ignore-blank-lines .ci/benchmark.txt benchmark.txt

      # always(): upload whatever was produced even when an earlier step
      # failed. benchmark.txt is absent after a cache hit because it is not
      # part of the cached paths.
      - name: Upload artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: review_head
          path: |
            review_head.txt
            review_head.html
            benchmark.txt

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Builds the review report for the base commit of the pull request and
  # uploads it as the `review_base` artifact. Runs after download_repos.
  review_base:

    needs: [ download_repos ]

    runs-on: ubuntu-latest

    steps:

      - name: Checkout CredData
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.base.sha }}

      # The checksum list covers snapshot.yaml, every file under meta/ and the
      # top-level *.py scripts. Its hash is the key of the review cache below.
      - name: Markup hashing
        run: |
          md5sum snapshot.yaml >base_checksums.md5
          # NUL-delimited pipelines keep paths with whitespace intact and run
          # md5sum once per batch instead of once per file
          find meta -type f -print0 | sort -z | xargs -0 -r md5sum >>base_checksums.md5
          find . -maxdepth 1 -type f -name "*.py" -print0 | sort -z | xargs -0 -r md5sum >>base_checksums.md5
          cat base_checksums.md5
          sha256sum base_checksums.md5

      # On a cache hit the report files are restored and every step guarded by
      # `cache-hit != 'true'` below is skipped.
      - name: Cache base review
        id: cache-data
        uses: actions/cache@v4
        with:
          path: |
            review_base.txt
            review_base.html
          key: cred-data-${{ hashFiles('base_checksums.md5') }}

      # NOTE(review): tmp is presumably the download cache of download_data.py;
      # it is keyed by snapshot.yaml only - confirm.
      - name: Cache tmp
        if: steps.cache-data.outputs.cache-hit != 'true'
        id: cache-tmp
        uses: actions/cache@v4
        with:
          path: tmp
          key: cred-data-${{ hashFiles('snapshot.yaml') }}

      # The colorized-logs package provides the ansi2html command used below.
      # apt-get with --yes never waits for confirmation on a runner.
      - name: install ansi2html
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: sudo apt-get update && sudo apt-get install --yes colorized-logs

      # v5 is the Node 20 based major of setup-python (v4 runs on the
      # deprecated Node 16 runtime).
      - name: Set up Python 3.10
        if: steps.cache-data.outputs.cache-hit != 'true'
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Produce review report from BASE
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: |
          python -m pip install --upgrade pip
          python -m pip install --requirement requirements.txt
          # skip extra hints from git (same setting as the HEAD job)
          git config --global init.defaultBranch work
          python download_data.py --data_dir data --jobs $(nproc)
          # stdout and stderr both go into the text report
          python review_data.py &>review_base.txt
          ansi2html --style 'pre {font-family: monospace; font-size: large}' <review_base.txt  >review_base.html

      # always(): upload whatever was produced even when an earlier step failed.
      - name: Upload artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: review_base
          path: |
            review_base.txt
            review_base.html

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Compares the BASE and HEAD review reports and uploads the difference as
  # the `review_diff` artifact (plain text plus an HTML rendering).
  diff_review:

    needs: [ download_repos, review_base ]

    runs-on: ubuntu-latest

    steps:

      # The colorized-logs package provides the ansi2html command used below.
      # apt-get with --yes never waits for confirmation on a runner.
      - name: install ansi2html
        run: sudo apt-get update && sudo apt-get install --yes colorized-logs

      # Without a `name` input every artifact of the run is downloaded into a
      # directory named after the artifact (review_base/, review_head/, ...).
      - name: Download all workflow run artifacts
        uses: actions/download-artifact@v4

      - name: Get diff for review
        run: |
          # diff exits 0 when the reports match, 1 when they differ and 2 on
          # trouble (for example a missing report file)
          # NOTE(review): the output is redirected to a file, so --color=auto
          # emits no colour codes; use --color=always if a coloured diff is wanted
          diff_status=0
          diff --unified=1 --color=auto review_base/review_base.txt review_head/review_head.txt &>review_diff.txt || diff_status=$?
          case "${diff_status}" in
            0)
              # identical reports: publish an empty HTML page
              touch review_diff.html
              ;;
            1)
              ansi2html --style 'pre {font-family: monospace; font-size: large}' <review_diff.txt >review_diff.html
              ;;
            *)
              # diff itself failed: show its message and fail the step
              # instead of publishing the error text as if it were a diff
              cat review_diff.txt
              exit "${diff_status}"
              ;;
          esac

      # always(): upload whatever was produced even when an earlier step failed.
      - name: Upload artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: review_diff
          path: |
            review_diff.txt
            review_diff.html

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  # Smoke test for markup_report.py: feeds it a single-item report and checks
  # that the report file is changed and contains the expected markup substring.
  markup_report_test:

    runs-on: ubuntu-latest

    steps:

      - name: Checkout CredData
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      # v5 is the Node 20 based major of setup-python (v4 runs on the
      # deprecated Node 16 runtime).
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Report markup test
        run: |
          echo '[
                    {
                        "api_validation": "NOT_AVAILABLE",
                        "ml_validation": "VALIDATED_KEY",
                        "ml_probability": 0.5,
                        "rule": "URL Credentials",
                        "severity": "high",
                        "confidence": "moderate",
                        "line_data_list": [
                            {
                                "line": "1b17231a97529365248357043f038afb58be4699fc955cee78d5c8a2b037e89d",
                                "line_num": 4256,
                                "path": "data/00408ef6/src/eee97fd2.c",
                                "info": "",
                                "value": "5489cf96c232e0925f96692a65e3e7afe8e09430c280e71b808da17dee9791f8",
                                "value_start": 16,
                                "value_end": 24,
                                "variable": "5b21d8adfc17cb2d231c2ec96498c8076c4a1d1e5373fa7d8b15a9e63d64d8f2",
                                "variable_start": 5,
                                "variable_end": 11,
                                "entropy_validation": {
                                    "iterator": "BASE64_CHARS",
                                    "entropy": 3.0,
                                    "valid": false
                                }
                            }
                        ]
                    }
                ]' >report.json
          # keep a pristine copy to prove that the script modified the report
          cp report.json report.bak
          python markup_report.py report.json
          if diff report.bak report.json; then
            echo "Something went wrong. The files must be different"
            exit 1
          else
            # the substring from markup must appear in "api_validation"
            # grep exits non-zero, and so fails the step, when it is missing
            grep 'eee97fd2,GitHub,00408ef6' report.json
          fi

      # always(): keep both report files for inspection even when the test failed.
      - name: Upload artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: reports
          path: |
            report.json
            report.bak

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #