# Crawl cf.gov #1492
# NOTE: the hosting viewer flagged this file as containing bidirectional
# Unicode text that may be interpreted or compiled differently than it
# appears. To review, open the file in an editor that reveals hidden Unicode
# characters.
# Scheduled workflow: crawls the site named in DOMAIN, post-processes the
# crawled HTML, and commits the results (plus gzipped logs) back to this
# repository.
name: Crawl cf.gov

# Triggers the workflow every day at 6AM UTC (1AM EST, 2AM EDT).
# workflow_dispatch additionally allows manual runs.
on:
  schedule:
    - cron: '0 6 * * *'
  workflow_dispatch:

# The final step pushes a commit with the job token, so the token needs
# write access to repository contents.
permissions:
  contents: write

env:
  # Host to crawl; also used as the name of the directory holding the
  # crawl results in the repository.
  DOMAIN: www.consumerfinance.gov

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Remove previous crawl results
        # -f keeps this step from failing when no earlier crawl directory
        # exists (for example, on the very first run).
        run: rm -rf -- "$DOMAIN"

      - name: Run the crawl script
        # NOTE(review): "-d 0" is presumably a crawl-depth setting; verify
        # against crawl.sh.
        run: ./crawl.sh -d 0 "https://$DOMAIN"

      - name: Simplify crawled HTML to prevent unimportant diffs
        run: postprocessing/transform_results.sh "$DOMAIN"
        # A failure here does not fail the job; the later steps still run
        # and commit whatever results exist.
        continue-on-error: true

      - name: Prepare files for git commit
        run: |
          gzip -f *.log
          git add *.log.gz
          git add "$DOMAIN"

      - name: Prepare commit message
        run: ./generate_summary.sh "$DOMAIN" > commit.txt

      - name: Commit crawl results back to GitHub
        # NOTE(review): the committer email in the reviewed copy of this file
        # was replaced by a redaction placeholder; the address below is the
        # conventional Actions no-reply address. Confirm against the
        # repository history.
        run: |
          git config user.email actions@users.noreply.github.com
          git config user.name Automated
          # "git commit" exits non-zero when nothing is staged, which would
          # fail the run; skip the commit and push in that case.
          if git diff --cached --quiet; then
            echo "No changes staged; skipping commit."
          else
            git commit -F commit.txt
            git push
          fi
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}