-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_sample_scripts.py
64 lines (48 loc) · 2.62 KB
/
filter_sample_scripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
from typing import Callable
def filter_files_based_on_conditions(metrics_file:str, conditions:dict[str, Callable], output_file:str) -> pd.DataFrame:
"""This function filterest the list of Python file names based on the provided conditions.
Args:
metrics_file (str): Filename of the .xlsx file containing the metrics of the Python files.
conditions (dict[str, function]): Dictionary of conditions. Keys must match the header names of `metrics_file`. Values must be lambda functions.
output_file (str): Filename of the .xlsx file that lists all filtered files that match all given conditions.
Returns:
pd.DataFrame: The dataframe containing the filtered rows from `metrics_file`. Represents the filtered Python scripts.
"""
# Read metrics from .xlsx file
df = pd.read_excel(metrics_file)
# List to store filtered files
filtered_files = []
# Total number of rows in the dataframe
total_files = len(df)
for i, (_, row) in enumerate(df.iterrows(), start=1):
# Check that all conditions are met for each file
if all(condition(row[column]) for column, condition in conditions.items() if column in row):
filtered_files.append(row)
# Print progress
print(f"Processed {i}/{total_files} files", end='\r')
# Filter columns to include only those referenced in the conditions
condition_columns = list(conditions.keys())
# Create dataframe for filtered files
filtered_df = pd.DataFrame(filtered_files)
# only select columns that exist in the DataFrame
columns_to_save = ["file"] + [col for col in condition_columns if col in filtered_df.columns]
# Save filtered files with only the relevant columns
filtered_df = filtered_df[columns_to_save]
filtered_df.to_excel(output_file, index=False)
print(f" > Filtered {len(filtered_files)} out of {total_files} files and saved to {output_file}")
return filtered_df
# Define conditions for filtering
conditions = {
"lines": lambda x: 30 <= x <= 300, # between 30 and 300 lines
"functions": lambda x: x >= 3, # at least three functions
"functions_with_docstring": lambda x: x >= 3, # at least three functions with a docstring
# add more conditions here if necessary ...
}
# Path to the .xlsx file containing the metrics
metrics_file = "./script_metrics.xlsx"
# Path to the .xlsx file in which to place the names of the filtered scripts
output_file = "./filtered_files.xlsx"
# Get the filtered files based on conditions
filtered_df = filter_files_based_on_conditions(metrics_file, conditions, output_file)
print(f"Done.")