#!/usr/bin/env python3
"""
A scraper that retrieves the complete set of dependencies of a given Maven
artifact by crawling mvnrepository.com. This script is brittle: it will usually
fetch the full dependency set, but it may fail in certain cases, such as pages
whose dependency tables have an unusual structure.
In its current configuration it lists the dependencies that were declared at
release time; it does not report newer versions or updates.
"""
import urllib.request
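# Crawl state shared by the functions below: 'fringe' holds artifact paths still to
# visit and 'already_checked' holds paths whose pages have already been scraped.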
fringe = set()
already_checked = set()
human_readable_deps = set()
url_prefix = "https://mvnrepository.com/artifact/"
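# Literal HTML fragments used to locate the compile-dependencies table in the page
# source. They match the mvnrepository.com markup at the time of writing and will
# likely need updating if the site's layout changes.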
compile_deps_string = "<h2>Compile Dependencies ("
table_open_string = "<tbody"
table_close_string = "</tbody>"
table_row_open_string = "<tr>"
table_row_close_string = "</tr>"
table_row_cell_open_string = "<td"
table_row_cell_close_string = "</td>"
unique_identified_prefix = "<a class=\"vbtn release\" href=\"/artifact/"
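# A matching dependency link is expected to look roughly like (hypothetical values):
#   <a class="vbtn release" href="/artifact/com.google.guava/guava/19.0">19.0</a>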
# url MUST be of the form https://mvnrepository.com/artifact/*organization_id*/*artifact_id*/*version_id*
# Takes a starting artifact url, crawls its transitive dependencies, and returns them as a set of
# "organization_id/artifact_id/version_id" paths (the returned set also includes the starting artifact itself).
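# Example (output depends on the live site; entries shown are the expected form only):
#   get_deps_for_artifact("https://mvnrepository.com/artifact/com.google.guava/guava/19.0")
#   returns entries such as "com.google.guava/guava/19.0"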
def get_deps_for_artifact(url):
    if not url.startswith(url_prefix):
        raise ValueError("The url must begin with a valid https address for the maven central repository.")
    stripped_url = url[len(url_prefix):]
    fringe.add(stripped_url)
    # Worklist crawl: pop an artifact, scrape its page, and queue any unseen dependencies.
    while fringe:
        to_process = fringe.pop()
        if to_process in already_checked:
            continue
        already_checked.add(to_process)
        for dep in get_deps_from_page(url_prefix + to_process):
            if dep not in already_checked and dep not in fringe:
                fringe.add(dep)
    return already_checked
# Returns the set of compile dependencies listed on the specified page.
# url is expected to be a valid mvnrepository.com artifact url.
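# A page with compile dependencies is expected to contain a heading of the form
# "<h2>Compile Dependencies (3)</h2>" (count hypothetical) followed by a "<tbody>" table.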
def get_deps_from_page(url):
    page_deps = set()
    # download the page and decode it to a string
    html_string = urllib.request.urlopen(url).read().decode()
    # Determine how many compile deps are listed
    heading_index = html_string.find(compile_deps_string)
    if heading_index == -1:
        # No compile-dependencies heading on this page
        return page_deps
    start_index = heading_index + len(compile_deps_string)
    end_index = html_string.find(")</h2>", start_index)
    compile_deps_count = int(html_string[start_index: end_index])
    # Get the compile deps if any
    if compile_deps_count != 0:
        table_open_index = html_string.find(table_open_string, end_index)
        table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string)
        compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
        page_deps.update(compile_deps)
    return page_deps
# The table is expected to span from the "<tbody" tag to the "</tbody>" tag.
def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
    table_deps = set()
    start_index = html_string.find(table_row_open_string, table_open_index, table_close_index)
    while start_index != -1:
        end_index = html_string.find(table_row_close_string, start_index, table_close_index)\
            + len(table_row_close_string)
        row_dep = get_dep_from_row(html_string, start_index, end_index)
        if row_dep is not None:
            table_deps.add(row_dep)
        start_index = html_string.find(table_row_open_string, end_index, table_close_index)
    return table_deps
# The row is expected to span from the "<tr>" tag to the "</tr>" tag and to contain 5 matched
# pairs of "<td...></td>" tags; the fourth such pair contains the desired dependency link and
# the fifth contains updated versions (which are ignored).
def get_dep_from_row(html_string, row_start_index, row_end_index):
    start_index = html_string.find(table_row_cell_open_string, row_start_index, row_end_index)
    end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
        + len(table_row_cell_close_string)
    # advance the indices to the fourth "<td>" element
    for _ in range(3):
        start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index)
        end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
            + len(table_row_cell_close_string)
    return get_dep_from_cell(html_string, start_index, end_index)
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if prefix_index == -1:
        # The cell does not contain a dependency link in the expected format
        return None
    start_index = prefix_index + len(unique_identified_prefix)
    end_index = html_string.find("\"", start_index, cell_end_index)
    return html_string[start_index:end_index]
# Produces the same dependency set as get_deps_for_artifact but returns the entries as Maven coordinates.
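# e.g. the path "com.google.guava/guava/19.0" becomes "mvn:com.google.guava:guava:19.0"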
def get_mvn_coordinates_deps(url):
    mvn_coords = set()
    for elem in get_deps_for_artifact(url):
        artifact_start_index = elem.find("/")
        org_id = elem[:artifact_start_index]
        version_start_index = elem.find("/", artifact_start_index + 1)
        artifact_id = elem[artifact_start_index + 1: version_start_index]
        version = elem[version_start_index + 1:]
        mvn_coords.add("mvn:" + org_id + ":" + artifact_id + ":" + version)
    return mvn_coords
# Prints each element of the given collection on its own line
def print_collection(lst):
    for elem in lst:
        print(elem)
def main():
    url = input("Please enter the url of the artifact whose dependencies you would like to list." +
                "\n(This should be a fully qualified url,\nex: https://mvnrepository." +
                "com/artifact/com.google.guava/guava/19.0)")
    mvn_coords = input("Would you like Maven coordinate output? Enter 'y' for yes " +
                       "(otherwise url-style paths will be printed).")
    if mvn_coords.lower() == "y":
        print_collection(get_mvn_coordinates_deps(url))
    else:
        print_collection(get_deps_for_artifact(url))
if __name__ == '__main__':
    main()