Aaron Kruglikov | 041b254 | 2017-06-13 16:21:19 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
"""
This is a scraper-based set of scripts designed to retrieve the complete set
of deps of a given artifact. This script is brittle: it will USUALLY fetch the
set of dependencies, but it may fail in certain cases, including on pages
with unusual url table structures.

In its current configuration it lists the dependencies that were officially
supported at release; it does not note any updates whatsoever.
"""
| 11 | import urllib.request |
| 12 | import urllib.response |
| 13 | |
| 14 | |
# Module-level working sets. They start out empty at import time.
fringe = set()
already_checked = set()
human_readable_deps = set()

# Every artifact page lives under this address; dependency paths are
# appended to it to form the page url.
url_prefix = "https://mvnrepository.com/artifact/"

# Start of the heading that introduces the compile dependency table; the
# dependency count and ")</h2>" follow it on the page.
compile_deps_string = "<h2>Compile Dependencies ("

# Markers used to locate the table body, its rows and the cells of a row.
# The table and cell openers omit the closing ">" so tags with attributes
# also match; the row opener is the exact tag "<tr>".
table_open_string = "<tbody"
table_close_string = "</tbody>"

table_row_open_string = "<tr>"
table_row_close_string = "</tr>"

table_row_cell_open_string = "<td"
table_row_cell_close_string = "</td>"

# Opening of the release-version link inside a cell; the text after it, up
# to the next double quote, is the "organization/artifact/version" path.
unique_identified_prefix = '<a class="vbtn release" href="/artifact/'
| 33 | |
| 34 | |
def get_deps_for_artifact(url):
    """Collect the transitive compile dependencies of an artifact.

    Starting from ``url``, every page reachable through the dependencies
    reported by ``get_deps_from_page`` is visited exactly once.

    Args:
        url: Must be of the form
            https://mvnrepository.com/artifact/*organization_id*/*artifact_id*/*version_id*

    Returns:
        A new set of "organization/artifact/version" paths (the url with the
        repository prefix removed). The starting artifact itself is included.

    Raises:
        ValueError: If ``url`` does not begin with the repository prefix.
    """
    if not url.startswith(url_prefix):
        raise ValueError("The url must begin with a valid https address for the maven central repository.")

    # Slice instead of str.replace so only the leading prefix is removed,
    # even if the same text happens to occur again later in the url.
    root = url[len(url_prefix):]

    # The traversal state is local to the call: results of an earlier call
    # must not leak into a later one, and the caller gets its own set.
    pending = {root}
    visited = set()
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        for dep in get_deps_from_page(url_prefix + current):
            if dep not in visited:
                pending.add(dep)
    return visited
| 51 | |
| 52 | |
def get_deps_from_page(url):
    """Return the compile dependencies listed on one artifact page.

    Args:
        url: Address of the page to download; expected to be a valid url.

    Returns:
        A set of dependency paths taken from the first table body that
        follows the "Compile Dependencies (N)" heading. The set is empty when
        the page has no such heading, when N is 0, or when no table follows.

    Raises:
        ValueError: If the heading is present but its count cannot be read.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html_string = response.read().decode()

    header_index = html_string.find(compile_deps_string)
    if header_index == -1:
        # No compile dependency section on this page. Without this guard the
        # -1 from find() would be used as a position and garbage would be
        # handed to int().
        return set()

    # Determine how many compile deps the heading announces.
    count_start = header_index + len(compile_deps_string)
    count_end = html_string.find(")</h2>", count_start)
    if count_end == -1:
        raise ValueError("Could not read the compile dependency count on page: " + url)
    compile_deps_count = int(html_string[count_start:count_end])
    if compile_deps_count == 0:
        return set()

    # Locate the table body that follows the heading.
    table_open_index = html_string.find(table_open_string, count_end)
    if table_open_index == -1:
        return set()
    table_close_start = html_string.find(table_close_string, table_open_index)
    if table_close_start == -1:
        return set()
    table_close_index = table_close_start + len(table_close_string)

    return set(get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count))
| 72 | |
def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
    """Extract the dependency of every row inside one table body.

    Args:
        html_string: Full text of the page.
        table_open_index: Position of the "<tbody" tag.
        table_close_index: Position just past the "</tbody>" tag.
        expected_count: Number of dependencies announced by the page heading.
            It is accepted for the callers' sake but not used here.

    Returns:
        A set with the dependency path of each row that contains one.
    """
    table_deps = set()
    row_start = html_string.find(table_row_open_string, table_open_index, table_close_index)

    while row_start != -1:
        row_close = html_string.find(table_row_close_string, row_start, table_close_index)
        if row_close == -1:
            # Unterminated row. Carrying on with -1 would move the search
            # position back to the start of the document and the same rows
            # would be scanned again without end.
            break
        row_end = row_close + len(table_row_close_string)

        row_dep = get_dep_from_row(html_string, row_start, row_end)
        if row_dep is not None:
            table_deps.add(row_dep)
        row_start = html_string.find(table_row_open_string, row_end, table_close_index)

    return table_deps
| 87 | |
def get_dep_from_row(html_string, row_start_index, row_end_index):
    """Return the dependency path found in the fourth cell of a table row.

    The row is expected to span from its "<tr>" tag to its "</tr>" tag and to
    contain five "<td...></td>" pairs: the fourth holds the desired
    information, the fifth holds updated versions and is ignored.

    Args:
        html_string: Full text of the page.
        row_start_index: Position where the row begins.
        row_end_index: Position just past the end of the row.

    Returns:
        Whatever ``get_dep_from_cell`` yields for the fourth cell, or None
        when the row has fewer than four complete cells.
    """
    cell_start = -1
    cell_end = -1
    search_from = row_start_index
    # Walk forward cell by cell until the fourth one has been delimited.
    for _ in range(4):
        cell_start = html_string.find(table_row_cell_open_string, search_from, row_end_index)
        if cell_start == -1:
            # Stop here: a -1 fed into the next find() would wrap the search
            # around and pick up an earlier cell instead.
            return None
        cell_close = html_string.find(table_row_cell_close_string, cell_start, row_end_index)
        if cell_close == -1:
            return None
        cell_end = cell_close + len(table_row_cell_close_string)
        search_from = cell_end

    return get_dep_from_cell(html_string, cell_start, cell_end)
| 104 | |
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    """Return the artifact path of the release link inside a table cell.

    Args:
        html_string: Full text of the page.
        cell_start_index: Position where the cell begins.
        cell_end_index: Position just past the end of the cell.

    Returns:
        The text between the release-link prefix and the next double quote,
        or None when the cell has no such link or the quote is missing.
    """
    link_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if link_index == -1:
        return None

    path_start = link_index + len(unique_identified_prefix)
    path_end = html_string.find("\"", path_start, cell_end_index)
    if path_end == -1:
        # Without a closing quote the slice below would end at -1 and return
        # nearly everything up to the end of the document.
        return None
    return html_string[path_start:path_end]
| 112 | |
def get_mvn_coordinates_deps(url):
    """Produce the dependency set of an artifact in mvn coordinate style.

    Args:
        url: Starting url, passed unchanged to ``get_deps_for_artifact``.

    Returns:
        A set of strings of the form "mvn:organization:artifact:version",
        one for each path returned by ``get_deps_for_artifact``.
    """
    mvn_coords = set()
    for elem in get_deps_for_artifact(url):
        # At most two splits: any further "/" stays inside the version part.
        # A path with fewer components simply yields fewer coordinate
        # fields, rather than a chopped artifact id and a bogus version.
        mvn_coords.add("mvn:" + ":".join(elem.split("/", 2)))

    return mvn_coords
| 125 | |
| 126 | |
def print_collection(lst):
    """Print each element of ``lst`` on a line of its own."""
    for item in lst:
        print(item)
| 133 | |
def main():
    """Ask for an artifact url and an output style, then print its deps.

    Answering 'y' or 'Y' to the second question prints maven coordinates;
    any other answer prints url style paths.
    """
    url_prompt = (
        "Please enter the url of the repo whose dependencies you would like?"
        "\n(this should be a fully qualified url\nex: https://mvnrepository."
        "com/artifact/com.google.guava/guava/19.0)"
    )
    style_prompt = "Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)"

    url = input(url_prompt)
    answer = input(style_prompt)

    # Pick the collector that matches the requested output style.
    collect = get_mvn_coordinates_deps if answer in ("y", "Y") else get_deps_for_artifact
    print_collection(collect(url))


if __name__ == '__main__':
    main()