fix: swaps from github API to expanding the assets #907

Merged: 3 commits, Aug 16, 2023

Changes from all commits
89 changes: 87 additions & 2 deletions bioconda_utils/hosters.py
@@ -231,6 +231,36 @@ def error(self, message: str) -> None:
        logger.debug("Error parsing HTML: %s", message)


class IncludeFragmentParser(HTMLParser):
    """Extract ``src`` targets of ``<include-fragment>`` tags from HTML"""
    def __init__(self, link_re: Pattern[str]) -> None:
        super().__init__()
        self.link_re = link_re
        self.matches: List[Mapping[str, Any]] = []

    def get_matches(self) -> List[Mapping[str, Any]]:
        """Return matches found for **link_re** in src links"""
        return self.matches

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]) -> None:
        if tag == "include-fragment":
            for key, val in attrs:
                if key == "src":
                    self.handle_a_href(val)
                    break

    def handle_a_href(self, href: str) -> None:
        """Process the src attribute of an include-fragment tag"""
        match = self.link_re.search(href)
        if match:
            data = match.groupdict()
            data["href"] = href
            self.matches.append(data)

    def error(self, message: str) -> None:
        logger.debug("Error parsing HTML: %s", message)
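
# Illustrative sketch (not part of this diff): how IncludeFragmentParser behaves
# on a fragment of GitHub releases HTML. The pattern and repo URL below are
# hypothetical stand-ins.
import re  # already imported by hosters.py; repeated here for self-containment
_demo_parser = IncludeFragmentParser(
    re.compile(r"/releases/expanded_assets/(?P<version>[^\"/]+)"))
_demo_parser.feed(
    '<include-fragment loading="lazy" '
    'src="https://github.com/owner/repo/releases/expanded_assets/1.0">'
    '</include-fragment>')
assert _demo_parser.get_matches() == [{
    "version": "1.0",
    "href": "https://github.com/owner/repo/releases/expanded_assets/1.0",
}]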


# pylint: disable=abstract-method
class HTMLHoster(Hoster):
    """Base for Hosters handling release listings in HTML format"""

@@ -326,15 +356,70 @@ class GithubBase(OrderedHTMLHoster):
class GithubRelease(GithubBase):
    """Matches release artifacts uploaded to Github"""
    link_pattern = r"/{account}/{project}/releases/download/{tag}/{fname}{ext}?"
    expanded_assets_pattern = r"https://github.com/{account}/{project}/releases/expanded_assets/{version}"
    alt_releases_formats = ["https://api.github.com/repos/{account}/{project}/releases"]
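    # For illustration (hypothetical repo, not part of this diff): with
    # vals = {"account": "samtools", "project": "htslib", "version": "1.17"},
    # the API template above formats to
    #   https://api.github.com/repos/samtools/htslib/releases
    # and the expanded-assets pattern matches URLs such as
    #   https://github.com/samtools/htslib/releases/expanded_assets/1.17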

    async def get_versions(self, req, orig_version):
        # first, try the original approach of scraping links from the HTML
        matches = await super().get_versions(req, orig_version)
        if len(matches) > 0:
            return matches

        # next, try parsing the expanded-assets fragments; this may break if
        # GitHub changes the page structure in the future
        matches = await self.get_expanded_versions(req, orig_version)
        if len(matches) > 0:
            return matches

        # finally, fall back to the GitHub API, which quickly hits the rate limit
        matches = await self.get_api_versions(req, orig_version)
        return matches

    async def get_expanded_versions(self, req, orig_version):
        # This parses the releases page and expands the sub-pages that are
        # collapsed on the initial download. The setup below mirrors
        # HTMLHoster, but we need the raw page contents to look for the
        # expanded-assets fragments.
        exclude = set(self.exclude)
        vals = {key: val
                for key, val in self.vals.items()
                if key not in exclude}

        # this is the pattern for the expanded assets, which auto-expand when
        # the page is viewed in a browser
        expanded_assets_pattern = replace_named_capture_group(
            self.expanded_assets_pattern_compiled, vals)
        expanded_assets_re = re.compile(expanded_assets_pattern)

        # after we expand an asset, we still need to look for the original
        # link pattern within the fragment
        link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
        link_re = re.compile(link_pattern)

        result = []
        for url in self.releases_urls:
            # we cannot use HrefParser here because the target URL is in an
            # <include-fragment> tag, not an <a> tag
            parser = IncludeFragmentParser(expanded_assets_re)
            parser.feed(await req.get_text_from_url(url))

            # now iterate over each expanded asset we found
            for match in parser.get_matches():
                # fetch the expansion and look for the primary link pattern
                link_parser = HrefParser(link_re)
                link_parser.feed(await req.get_text_from_url(match["href"]))

                for lp_match in link_parser.get_matches():
                    # we found a match in the expansion
                    result.append({
                        'link': urljoin(url, lp_match["href"]),
                        'version': lp_match['version'],
                    })

                if match["version"] == self.vals["version"]:
                    # we reached the current version; exit early so we do not
                    # fetch every expanded asset on the page
                    break

        return result
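
    # Shape of the list get_expanded_versions returns (values hypothetical):
    #   [{"link": "https://github.com/owner/repo/releases/download/1.0/pkg-1.0.tar.gz",
    #     "version": "1.0"}]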

    # if the HTML approaches found nothing, try the alternate GitHub API URLs,
    # which return JSON
    async def get_api_versions(self, req, orig_version):
        # This searches for releases via the API.
        # TODO: we basically hit the rate limit immediately with this approach;
        # we eventually need some long-term persistent memory that can track
        # ETags or Last-Modified values so we only pay the full cost on the
        # initial spin-up (see the sketch after this diff). More information
        # on ETags:
        # https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#conditional-requests
        self.releases_urls = [
            template.format_map(self.vals)
            for template in self.alt_releases_formats
@@ -349,7 +434,7 @@ async def get_versions(self, req, orig_version):
        link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
        link_re = re.compile(link_pattern)

        # now iterate over the alternate release URLs
        matches = []
        for url in self.releases_urls:
            text = await req.get_text_from_url(url)
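
A minimal sketch of the conditional-request idea described in the TODO above, assuming aiohttp as the HTTP client (bioconda-utils' actual request layer may differ) and an in-memory dict standing in for the long-term persistent cache. Per the GitHub docs linked in the TODO, a conditional request answered with 304 Not Modified does not count against the rate limit:

from typing import Optional
import aiohttp

async def fetch_releases(url: str, etag_cache: dict) -> Optional[str]:
    """Fetch *url*, reusing a cached ETag to avoid spending rate limit."""
    headers = {}
    if url in etag_cache:
        # ask GitHub to send a body only if the resource changed
        headers["If-None-Match"] = etag_cache[url]["etag"]
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            if resp.status == 304:
                # unchanged since the last fetch; reuse the cached body
                return etag_cache[url]["body"]
            resp.raise_for_status()
            body = await resp.text()
            if "ETag" in resp.headers:
                etag_cache[url] = {"etag": resp.headers["ETag"], "body": body}
            return body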