Skip to content

Commit

Permalink
Packages to be built can now be specified in build_list.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
Vel1khan committed Sep 3, 2023
1 parent 0b1b458 commit 3b5be79
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 88 deletions.
7 changes: 7 additions & 0 deletions pyscripts/vc/build_list.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
groupid,artifactid,version # Header
# FROM RC
com.corgibytes,mrm,1.4.2

# NOT FROM RC WITH TIMESTAMP PROPERTY

# NOT FROM RC WITHOUT TIMESTAMP
112 changes: 46 additions & 66 deletions pyscripts/vc/build_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def __init__(self, db: Database, config: Config):
self.log = logging.getLogger(__name__)
self.db = db
self.config = config
self.db.create_builds_table()
self.db.create_jar_repr_table()
self.db.create_err_table()

# TODO check returncode and .buildinfo manually when package fails,
def build_all(self):
Expand All @@ -32,22 +35,29 @@ def build_all(self):
Central, also build using that.
"""
os.chdir("./temp/builder")
self.db.create_builds_table()
self.db.create_jar_repr_table()
records = self.db.get_pkgs_with_tags()
total = len(records)
self.log.info(f"FOUND {total} with tag and outputTimestamp")
if self.config.BUILD_LIST:
records = self.db.get_pkgs_from_list_with_tags(self.config.BUILD_LIST)
self.log.info(f"FOUND {len(records)} packages to build.")
if len(records) != len(self.config.BUILD_LIST):
raise ValueError(
"Not all packages from build list were found and/or did "
+ "not have all necessary build params in DB."
)
else:
records = self.db.get_pkgs_with_tags()
self.log.info(f"FOUND {len(records)} packages to build.")

for i, record in enumerate(records):
self.log.info(f"Processing {i+1}/{total}")
self.log.info(f"Processing {i+1}/{len(records)}")
pkg = PackageId(record["groupid"], record["artifactid"], record["version"])
buildspecs = self.buildspec_exists(pkg)
if len(buildspecs) > 0:
self.log.debug(f"Buildspec found in {buildspecs[0]}!")
self.build_from_existing(pkg, buildspecs[0])
try:
self.build_from_scratch(pkg, record)
except ValueError as e:
self.log.debug(e)
except ValueError as err:
self.log.debug(err)

# remove folder once all builds for the package are complete
# folder = f"research/{pkg.groupid}-{pkg.artifactid}-{pkg.version}/"
Expand All @@ -68,7 +78,11 @@ def build_from_existing(self, pkg: PackageId, src_buildspec):
try:
build_spec = self.parse_buildspec(buildspec_path)
except ValueError:
self.log.error("Could not parse buildspec!")
self.db.insert_error(
pkg,
None,
f"(BUILDER) Could not parse buildspec with path {buildspec_path}",
)
return
build_result = self.build(buildspec_path)
build_id = self.db.insert_build(build_spec, build_result, from_existing=True)
Expand All @@ -91,15 +105,11 @@ def build_from_scratch(self, pkg: PackageId, record: DictRow):
nline_lf, nline_crlf = get_field(record, "line_ending_lf"), get_field(
record, "line_ending_crlf"
)
build_jdk_spec = self.convert_jdk_version(
get_field(record, "java_version_manifest_3")
)
build_jdk_spec = self.convert_jdk_version(get_field(record, "java_version_manifest_3"))
build_jdk = self.convert_jdk_version(
self.parse_build_jdk(get_field(record, "java_version_manifest_2"))
)
source_jdk_ver = self.convert_jdk_version(
get_field(record, "compiler_version_source")
)
source_jdk_ver = self.convert_jdk_version(get_field(record, "compiler_version_source"))

jdks = []
if build_jdk_spec:
Expand All @@ -108,12 +118,8 @@ def build_from_scratch(self, pkg: PackageId, record: DictRow):
jdks.append(build_jdk)
else:
# build with every LTS version available at package release
jdks.extend(
self.choose_jdk_versions(source_jdk_ver, pub_date, lts_only=True)
)
self.log.info(
f"No compiler JDK version found. Building with versions: {jdks}"
)
jdks.extend(self.choose_jdk_versions(source_jdk_ver, pub_date, lts_only=True))
self.log.info(f"No compiler JDK version found. Building with versions: {jdks}")

if nline_lf and not nline_crlf and not nline_inconsistent:
newlines = ["lf"]
Expand Down Expand Up @@ -146,15 +152,14 @@ def build(self, buildspec_path):
["./rebuild.sh", buildspec_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)

dir_path = os.path.dirname(buildspec_path)
search_pattern = os.path.join(dir_path, "*.buildcompare")
files = glob.glob(search_pattern)
if len(files) == 0:
return Build_Result(
False, process.stdout.decode(), process.stderr.decode(), None, None
)
return Build_Result(False, process.stdout.decode(), process.stderr.decode(), None, None)
self.log.debug(f"{len(files)} .buildcompare files found:\n{files}")
try:
with open(files[0], "r") as file:
Expand All @@ -181,9 +186,7 @@ def build(self, buildspec_path):

except (FileNotFoundError, KeyError):
self.log.debug("File not found or malformed. Build (probably) failed")
return Build_Result(
False, process.stdout.decode(), process.stderr.decode(), None, None
)
return Build_Result(False, process.stdout.decode(), process.stderr.decode(), None, None)

def buildspec_exists(self, pkg: PackageId) -> list:
"""Given a package, checks whether a buildspec has already been created by the Reproducible
Expand Down Expand Up @@ -212,12 +215,7 @@ def buildspec_exists(self, pkg: PackageId) -> list:

# path without artifactid
relative_path = (
pkg.groupid.replace(".", "/")
+ "/"
+ pkg.artifactid
+ "-"
+ pkg.version
+ ".buildspec"
pkg.groupid.replace(".", "/") + "/" + pkg.artifactid + "-" + pkg.version + ".buildspec"
)
path = os.path.join(base_path, relative_path)
if pkg.artifactid == "7zip":
Expand Down Expand Up @@ -245,9 +243,7 @@ def create_buildspec(
if "{artifactId}" in command
else command,
}
with open(
os.path.join(os.getcwd(), "..", "..", ".buildspec.template"), "r"
) as file:
with open(os.path.join(os.getcwd(), "..", "..", ".buildspec.template"), "r") as file:
content = file.read()
template = Template(content)
rendered = template.render(values)
Expand Down Expand Up @@ -294,9 +290,7 @@ def compare(self, pkg: PackageId, build_id: str, build_result: Build_Result):
self.log.debug("Build fail. Cannot compare JARs!")
return
non_repr_jars = [
fname
for fname in build_result.ko_files
if os.path.splitext(fname)[1] == ".jar"
fname for fname in build_result.ko_files if os.path.splitext(fname)[1] == ".jar"
]
if not non_repr_jars:
self.log.debug("No non-reproducible JARs.")
Expand All @@ -306,32 +300,28 @@ def compare(self, pkg: PackageId, build_id: str, build_result: Build_Result):
files = glob.glob(search_pattern)
if len(files) == 0:
# TODO LOG ERROR to DB
self.log.error(".buildcompare not found, Cannot compare!")
self.db.insert_error(pkg, None, "(COMPARE) .buildcompare not found")
return
buildinfo = files[0]

for jar in non_repr_jars:
reference_path, actual_path = extract_path_buildinfo(pkg, jar, buildinfo)
if reference_path is None or actual_path is None:
self.log.error("Reference or Actual artifact path not found!")
self.db.insert_error(
pkg, None, "(COMPARE) Reference or Actual artifact path not found!"
)
return
try:
hash_mismatches, extra_files, missing_files = compare_jars(
actual_path, reference_path
)
self.db.insert_jar_repr(
build_id, jar, hash_mismatches, missing_files, extra_files
)
self.db.insert_jar_repr(build_id, jar, hash_mismatches, missing_files, extra_files)
self.log.debug(hash_mismatches)
self.log.debug(extra_files)
self.log.debug(missing_files)
except FileNotFoundError:
self.log.exception("Couldn't find one of the files. See stacktrace...")

# for each .jar in
# research/{pkg.groupid}-{pkg.artifactid}-{pkg.version}/buildcache/{pkg.artifactid}/target/,
# compare to the corresponding .jar in reference/ contained within the same dir.
# Compare each file, storing their name, containing archive and whether it is reproducible in the db.
self.db.insert_error(pkg, None, "(COMPARE) Couldn't find one of the archives!")
return

def choose_jdk_versions(self, jdk_src_ver: str, pub_date, lts_only: bool) -> list:
"""Given the source jdk version and the package's publish date, returns
Expand Down Expand Up @@ -365,18 +355,10 @@ def choose_jdk_versions(self, jdk_src_ver: str, pub_date, lts_only: bool) -> lis
}
jdk_rel_dates = {k: pd.to_datetime(v) for k, v in data.items()}

all_vers_after = dict(
dropwhile(lambda kv: kv[0] != jdk_src_ver, jdk_rel_dates.items())
)
vers_at_publish = dict(
takewhile(lambda kv: kv[1] < pub_date, all_vers_after.items())
)
all_vers_after = dict(dropwhile(lambda kv: kv[0] != jdk_src_ver, jdk_rel_dates.items()))
vers_at_publish = dict(takewhile(lambda kv: kv[1] < pub_date, all_vers_after.items()))
if lts_only:
return [
ver
for ver in vers_at_publish
if ver in [jdk_src_ver, "8", "11", "17", "21"]
]
return [ver for ver in vers_at_publish if ver in [jdk_src_ver, "8", "11", "17", "21"]]
else:
return [ver for ver in vers_at_publish]

Expand All @@ -389,16 +371,14 @@ def convert_jdk_version(self, version: str):
else:
return re.sub(r"1\.([0-9]|1[0-9]|20|21)", r"\1", version)

def parse_build_jdk(self, version) -> str:
def parse_build_jdk(self, version):
"""Parses the major JDK version from the highly specific format
returned by the java.version system property.
"""
if not version:
return None

result = re.search(
r"(?:(1\.\d)|[2-9](?=\.\d)|(\d{2}|\d{1}(?![\d\.])))", version
)
result = re.search(r"(?:(1\.\d)|[2-9](?=\.\d)|(\d{2}|\d{1}(?![\d\.])))", version)
if result is None:
self.log.debug(f"COULDN'T PARSE {version}")
return None
Expand All @@ -415,6 +395,6 @@ def parse_build_jdk(self, version) -> str:
def clone_rep_central(self):
    """Clone the Reproducible Central fork into ``./temp/builder``.

    A failing clone is not fatal: the git exit status is inspected and an
    error is logged, but no exception is raised.
    """
    clone_dir = "./temp/builder"
    url = "https://github.com/Vel1khan/reproducible-central.git"
    # Run git exactly once. check=False because the return code is inspected
    # below instead of letting subprocess raise CalledProcessError.
    process = subprocess.run(["git", "clone", url, clone_dir], check=False)
    if process.returncode != 0:
        self.log.error("Problem encountered")
34 changes: 25 additions & 9 deletions pyscripts/vc/common/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import yaml
import csv
import logging

import pandas as pd
import yaml
from common.packageId import PackageId


class Config:
def __init__(self) -> None:
Expand All @@ -10,19 +14,31 @@ def __init__(self) -> None:
self.LOG_LEVEL = config["log_level"]
self.DB_CONFIG = config["database"]
self.RUN_LIST = [
key
for dictionary in config["run_config"]
for key, value in dictionary.items()
if value
key for dictionary in config["run_config"] for key, value in dictionary.items() if value
]
self.GITHUB_API_KEY = config["github_api_key"]
if "tag_finder" in self.RUN_LIST and self.GITHUB_API_KEY is None:
raise ValueError("GITHUB API KEY NOT SET!")
self.BUILD_CMD: str = config["build_cmd"]
if "builder" in self.RUN_LIST and not self.BUILD_CMD:
raise ValueError("Build command not set in config. Builder will FAIL!")

build_list_path = config["build_list"]
self.BUILD_LIST: list[PackageId] = (
self.read_build_list(build_list_path) if build_list_path else []
)
self.check_config()

def load_config(self, filename):
    """Parse the YAML file at ``filename`` and return the loaded object.

    Uses ``yaml.safe_load``, so only plain YAML data types are constructed.
    """
    with open(filename) as stream:
        return yaml.safe_load(stream)

def check_config(self):
    """Validate settings required by the enabled run steps.

    Raises:
        ValueError: If ``tag_finder`` is enabled but ``GITHUB_API_KEY`` is
            ``None``, or if ``builder`` is enabled but ``BUILD_CMD`` is
            empty/unset.
    """
    enabled_steps = self.RUN_LIST

    # The tag finder cannot run without a GitHub API key.
    if "tag_finder" in enabled_steps:
        if self.GITHUB_API_KEY is None:
            raise ValueError("GITHUB API KEY NOT SET!")

    # The builder needs a build command to put into generated buildspecs.
    if "builder" in enabled_steps:
        if not self.BUILD_CMD:
            raise ValueError("Build command not set in config. Builder will FAIL!")

def read_build_list(self, file):
    """Read the packages to build from a CSV build list.

    The file starts with a header row, followed by one
    ``groupid,artifactid,version`` row per package. Blank lines and
    everything after a ``#`` on a line are ignored.

    Args:
        file: Path (or file-like object) of the CSV build list.

    Returns:
        list[PackageId]: One entry per data row, in file order.
    """
    df = pd.read_csv(
        file,
        sep=",",
        comment="#",
        skip_blank_lines=True,
        # Read every field as text. With type inference a version such as
        # "1.10" is parsed as the float 1.1 and "2" as the int 2, which no
        # longer match the version strings stored in the database.
        dtype=str,
        # Keep literal values such as "NA" or "null" as plain strings
        # instead of converting them to NaN.
        keep_default_na=False,
    )
    # An inline "# ..." comment leaves the whitespace that preceded it on the
    # last field, so every field is stripped before use.
    return [
        PackageId(row[0].strip(), row[1].strip(), row[2].strip())
        for row in df.itertuples(index=False, name=None)
    ]
32 changes: 28 additions & 4 deletions pyscripts/vc/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,32 @@ def get_pkgs_with_tags(self):
self.logged_execute(query)
return self.cur.fetchall()

def get_pkgs_from_list_with_tags(self, pkg_list: list[PackageId]):
    """Fetch the build parameters for an explicit list of packages.

    Joins the tags, package and package_list tables and keeps only the
    requested packages that have both a repository URL and a tag name.

    Args:
        pkg_list: Packages to look up, matched on
            (groupid, artifactid, version).

    Returns:
        The rows returned by the cursor's ``fetchall()``, or an empty list
        when ``pkg_list`` is empty. Packages without a tag are omitted, so
        the result may be shorter than ``pkg_list``.
    """
    wanted = tuple((pkg.groupid, pkg.artifactid, pkg.version) for pkg in pkg_list)
    if not wanted:
        # An empty tuple would be rendered as "IN ()", which is not valid
        # SQL; nothing was requested, so nothing can match.
        return []

    query = f"""
        SELECT t.groupid, t.artifactid, t.version, tag_name, release_tag_name,
        t.url, java_version_manifest_2,
        java_version_manifest_3, compiler_version_source, output_timestamp_prop,
        lastmodified, line_ending_lf, line_ending_crlf, line_ending_inconsistent_in_file
        FROM {self.TAGS_TABLE} AS t
        JOIN {self.PKG_TABLE} p on t.groupid = p.groupid
        AND t.artifactid = p.artifactid
        AND t.version = p.version
        JOIN {self.PKG_LIST_TABLE} pl on t.groupid = pl.groupid
        AND t.artifactid = pl.artifactid
        AND t.version = pl.version
        WHERE t.url IS NOT NULL
        AND tag_name IS NOT NULL
        AND (t.groupid, t.artifactid, t.version) IN %s;
        """
    # The package triples are passed as a bound parameter, never
    # interpolated into the SQL text.
    self.cur.execute(query, (wanted,))
    return self.cur.fetchall()

def create_builds_table(self):
self.logged_execute(
f"""
Expand Down Expand Up @@ -429,10 +455,8 @@ def create_jar_repr_table(self):
)
self.conn.commit()

def insert_jar_repr(
self, build_id, archive, hash_mismatches, missing_files, extra_files
):
self.logged_execute(
def insert_jar_repr(self, build_id, archive, hash_mismatches, missing_files, extra_files):
self.cur.execute(
f"""
INSERT INTO {self.JAR_REPR_TABLE}
(build_id, archive, hash_mismatches, missing_files, extra_files)
Expand Down
13 changes: 10 additions & 3 deletions pyscripts/vc/recreate_build.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import sys

from build_packages import BuildPackages
from common.config import Config
Expand All @@ -8,8 +9,14 @@

def main():
"""
Standalone script used to recreate a certain build given its build_id (DEBUGGING).
Standalone script used to recreate a certain build given its build_id (DEBUGGING). Pass in the
build_id as a command line argument.
"""
if len(sys.argv) != 2:
raise ValueError("Please provide only the build_id as a cmd line argument!")

build_id = sys.argv[1]

config = Config()
db = Database(
config.DB_CONFIG["hostname"],
Expand All @@ -20,8 +27,7 @@ def main():

os.chdir("./temp/builder")
builder = BuildPackages(db, config)
# fetch build by build_id
p = db.get_build_params_by_id(4)
p = db.get_build_params_by_id(build_id)
if not p:
raise ValueError("Build does not exist!")
print(p)
Expand All @@ -31,6 +37,7 @@ def main():
pkg, p["url"], p["tag_name"], p["tool"], p["jdk"], p["newline"], p["command"]
)
result = builder.build(path)
builder.compare(pkg, build_id, result)
print(result.stdout)


Expand Down
Loading

0 comments on commit 3b5be79

Please sign in to comment.