From d470510d87023ab5eef46cd31f6f8752915f2215 Mon Sep 17 00:00:00 2001
From: AkshayDevkate
Date: Tue, 10 Dec 2024 13:21:00 +0100
Subject: [PATCH] fix(documentation_practices): contributing guideline and code of conduct

---
 .../check_contributing_conduct.py | 201 ++++++++----------
 1 file changed, 94 insertions(+), 107 deletions(-)

diff --git a/collect_variables/scripts/soft_dev_pract/documentation_practices/check_contributing_conduct.py b/collect_variables/scripts/soft_dev_pract/documentation_practices/check_contributing_conduct.py
index 93e8230..aac8949 100644
--- a/collect_variables/scripts/soft_dev_pract/documentation_practices/check_contributing_conduct.py
+++ b/collect_variables/scripts/soft_dev_pract/documentation_practices/check_contributing_conduct.py
@@ -1,42 +1,38 @@
-"""
-This script checks if GitHub repositories have `CONTRIBUTING.md` and
-`CODE_OF_CONDUCT.md` files. It processes repositories listed in a CSV
-file, checks the presence of these files in the root, `.github/`, and
-`docs/` directories. It uses the GitHub API and handles rate limiting
-by pausing execution for 20 minutes (or until rate limit resets) if
-the rate limit is reached. The results are saved to an output CSV file.
-
-Required environment variables:
-- GITHUB_TOKEN: GitHub Personal Access Token with appropriate permissions.
-
-Usage:
-    python script_name.py --input <input_csv> --output <output_csv>
-"""
-
 import os
 import argparse
 import pandas as pd
 import time
 from ghapi.all import GhApi
 from dotenv import load_dotenv
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+def load_credentials():
+    """
+    Load GITHUB_USER and GITHUB_TOKEN from the .env file located relative to the script's directory.
+    """
+    # Get the directory of the current script
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+
+    # Construct the relative path to the .env file
+    env_path = os.path.join(script_dir, '..', '..', '..', '..', '.env')
+
+    # Load the .env file
+    load_dotenv(dotenv_path=env_path, override=True)
+
+    # Get the GITHUB_USER and GITHUB_TOKEN from the .env file
+    user = os.getenv('GITHUB_USER')
+    token = os.getenv('GITHUB_TOKEN')
+
+    if not user or not token:
+        raise ValueError("GITHUB_USER or GITHUB_TOKEN not found. Please ensure the .env file is properly configured.")
+
+    return user, token
 
 
 def check_repository_files(api: GhApi, repo_owner: str, repo_name: str) -> bool:
     """
     Check if a repository contains `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md` files.
-
-    This function searches in the following directories:
-    - Root directory
-    - `.github/` folder
-    - `docs/` folder
-
-    Args:
-        api (GhApi): Authenticated GitHub API client.
-        repo_owner (str): The GitHub repository owner's username.
-        repo_name (str): The GitHub repository name.
-
-    Returns:
-        bool: True if at least one of the files is found, otherwise False.
     """
     paths_to_check = [
         "CONTRIBUTING.md",
@@ -57,46 +53,50 @@
 def check_rate_limit(api: GhApi) -> bool:
     """
-    Check if the GitHub API rate limit has been reached. If the rate limit is
-    exceeded, the script sleeps until the rate limit resets.
-
-    Args:
-        api (GhApi): Authenticated GitHub API client.
-
-    Returns:
-        bool: True if the rate limit has been reached and the script is sleeping.
+    Check if the GitHub API rate limit has been reached. If exceeded, the script sleeps.
""" rate_limit = api.rate_limit.get() remaining = rate_limit.resources.core.remaining reset_time = rate_limit.resources.core.reset if remaining == 0: - reset_timestamp = reset_time.timestamp() current_time = time.time() - sleep_time = reset_timestamp - current_time + 60 # Adding 1 minute buffer + sleep_time = reset_time - current_time + 60 # Adding 1 minute buffer print(f"Rate limit reached. Sleeping for {int(sleep_time // 60)} minutes...") time.sleep(sleep_time) return True return False -def process_repositories(input_csv: str, output_csv: str, token: str) -> None: +def process_repository(api: GhApi, row): """ - Processes each GitHub repository from the input CSV file, checks for the - presence of `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md` files, and saves - the results in an output CSV file. The function handles rate limiting by - sleeping when the limit is exceeded. - - Args: - input_csv (str): Path to the input CSV file containing repository URLs. - output_csv (str): Path to the output CSV file where results will be saved. - token (str): GitHub Personal Access Token for authentication. - - Returns: - None + Process a single repository: Check for files and return the result. + """ + html_url = row.get('html_url') + if not html_url or "github.com" not in html_url: + return None, None # Skip non-GitHub URLs + + try: + # Parse repository owner and name + parts = html_url.replace("https://github.com/", "").split("/") + if len(parts) < 2: + return None, None # Invalid repository URL + repo_owner, repo_name = parts[0], parts[1] + + # Check for files + has_contributing = check_repository_files(api, repo_owner, repo_name) + has_code_of_conduct = check_repository_files(api, repo_owner, repo_name) + return has_contributing, has_code_of_conduct + except Exception as e: + print(f"Error processing repository: {html_url} ({e})") + return None, None + + +def process_repositories(input_csv: str, output_csv: str, user: str, token: str, batch_size: int = 10, max_threads: int = 5) -> None: + """ + Processes repositories in batches with partial saving and parallel processing. """ try: - # Try reading the CSV with a semicolon delimiter df = pd.read_csv(input_csv, delimiter=';', encoding='utf-8') except UnicodeDecodeError: print(f"Error reading {input_csv} with UTF-8 encoding. Trying ISO-8859-1...") @@ -107,73 +107,60 @@ def process_repositories(input_csv: str, output_csv: str, token: str) -> None: print(f"Error: The 'html_url' column is missing in the input CSV file.") return - # Add columns for results - df['has_contributing'] = None - df['has_code_of_conduct'] = None - - api = GhApi(token=token) + # Add result columns if not present + if 'has_contributing' not in df.columns: + df['has_contributing'] = None + if 'has_code_of_conduct' not in df.columns: + df['has_code_of_conduct'] = None + api = GhApi(owner=user, token=token) total_repos = len(df) - completed = 0 - for index, row in df.iterrows(): - try: - # Check rate limit before processing each repository + with ThreadPoolExecutor(max_threads) as executor: + futures = {} + for index, row in df.iterrows(): + # Check rate limit before submitting tasks if check_rate_limit(api): - print(f"Rate limit reset, proceeding with next repository.") - - # Skip non-GitHub URLs - if "github.com" not in row['html_url']: - print(f"Skipping non-GitHub domain: {row['html_url']}") - continue + print("Rate limit reset. 
Continuing...") - # Parse repository owner and name from URL - parts = row['html_url'].replace("https://github.com/", "").split("/") - if len(parts) < 2: + # Skip if already processed + if pd.notnull(df.at[index, 'has_contributing']) and pd.notnull(df.at[index, 'has_code_of_conduct']): continue - repo_owner, repo_name = parts[0], parts[1] - - # Check for files - df.at[index, 'has_contributing'] = check_repository_files(api, repo_owner, repo_name) - df.at[index, 'has_code_of_conduct'] = check_repository_files(api, repo_owner, repo_name) - except Exception as e: - print(f"Skipping repository due to error: {row['html_url']} ({e})") - continue - finally: - completed += 1 - print(f"Processed: {completed}/{total_repos}") - - # Save results to output CSV + + # Log repository being processed + print(f"Processing repository: {row['html_url']} (Index: {index})") + + futures[executor.submit(process_repository, api, row)] = index + + completed = 0 + for future in as_completed(futures): + index = futures[future] + try: + has_contributing, has_code_of_conduct = future.result() + df.at[index, 'has_contributing'] = has_contributing + df.at[index, 'has_code_of_conduct'] = has_code_of_conduct + except Exception as e: + print(f"Error processing index {index}: {e}") + finally: + completed += 1 + print(f"Processed: {completed}/{total_repos}") + + # Save partial results every batch_size + if completed % batch_size == 0: + df.to_csv(output_csv, index=False) + print(f"Partial results saved at {completed}/{total_repos}") + + # Final save df.to_csv(output_csv, index=False) - print(f"Results saved to {output_csv}") + print(f"Final results saved to {output_csv}") def main() -> None: """ Main function to parse command-line arguments and execute the script. - - Parses the input CSV file path, output CSV file path, and GitHub token - from the environment variables or command-line arguments. It then processes - the repositories and checks for the required files. - - Returns: - None """ - # Get the directory of the current script - script_dir = os.path.dirname(os.path.realpath(__file__)) - - # Create the relative path to the .env file - env_path = os.path.join(script_dir, '..', '..', '..', '..', '.env') - - # Load the .env file - load_dotenv(dotenv_path=env_path, override=True) - - # Get the GITHUB_TOKEN from the .env file - token = os.getenv('GITHUB_TOKEN') - - if not token: - print("GitHub token not found in .env file.") - return + # Load the GitHub user and token + user, token = load_credentials() # Command-line argument parsing parser = argparse.ArgumentParser(description="Check for CONTRIBUTING.md and CODE_OF_CONDUCT.md in GitHub repositories.") @@ -182,7 +169,7 @@ def main() -> None: args = parser.parse_args() # Process repositories - process_repositories(args.input, args.output, token) + process_repositories(args.input, args.output, user, token) if __name__ == "__main__":