blob: 489a23f1cf55456b9bd3b46b7c8c7a12c9cb25e5 [file] [log] [blame]
Aaron Kruglikov041b2542017-06-13 16:21:19 -07001#!/usr/bin/env python
2"""
This is a scraper-based series of scripts designed to retrieve the complete set
of deps of a given artifact. This script is brittle: it will USUALLY fetch the
set of dependencies, but it may fail in certain cases, including on pages
with unusual URL table structures.
7
8In its current configuration it will list the dependencies that were officially
9supported at release, it will not note any updates whatsoever.
10"""
11import urllib.request
12import urllib.response
13
14
# Module-level scratch state for the crawl (see get_deps_for_artifact):
# artifacts queued for processing and artifacts already processed.
# human_readable_deps appears unused in this portion of the file.
fringe = set()
already_checked = set()
human_readable_deps = set()

# All artifact pages live under this prefix; crawl keys are the URL suffix
# of the form "org_id/artifact_id/version_id".
url_prefix = "https://mvnrepository.com/artifact/"

# Heading that precedes the compile-dependency count, e.g.
# "<h2>Compile Dependencies (12)</h2>".
compile_deps_string = "<h2>Compile Dependencies ("

# Raw-HTML markers used to slice the dependency table, its rows, and the
# cells of each row out of the fetched page text.
table_open_string = "<tbody"
table_close_string = "</tbody>"

table_row_open_string = "<tr>"
table_row_close_string = "</tr>"

table_row_cell_open_string = "<td"
table_row_cell_close_string = "</td>"

# Anchor-tag prefix of a dependency link; the "org/artifact/version" path
# follows immediately after it, terminated by a double quote.
unique_identified_prefix = "<a class=\"vbtn release\" href=\"/artifact/"
33
34
def get_deps_for_artifact(url):
    """Crawl mvnrepository.com from *url* and return the set of all
    transitively reachable compile dependencies, including the starting
    artifact itself, as "org_id/artifact_id/version_id" path strings.

    url MUST be of the form
    https://mvnrepository.com/artifact/*organization_id*/*artifact_id*/*version_id*

    Raises:
        ValueError: if url does not start with the maven repository prefix.
    """
    if not url.startswith(url_prefix):
        raise ValueError("The url must begin with a valid https address for the maven central repository.")
    # Use local state rather than the module-level fringe/already_checked
    # sets so repeated calls do not accumulate results from earlier crawls.
    to_visit = {url.replace(url_prefix, "")}
    visited = set()
    while to_visit:
        current = to_visit.pop()
        if current in visited:
            continue
        visited.add(current)
        for dep in get_deps_from_page(url_prefix + current):
            if dep not in visited:
                to_visit.add(dep)
    return visited
51
52
# page is expected to be a valid url
def get_deps_from_page(url):
    """Fetch *url* and return the set of compile-scope dependency paths
    listed on that artifact page (empty set when there are none)."""
    page_deps = set()
    # Close the connection deterministically instead of leaking the
    # response object.
    with urllib.request.urlopen(url) as response:
        html_string = response.read().decode()

    # The heading reads "<h2>Compile Dependencies (N)</h2>"; extract N.
    start_index = html_string.find(compile_deps_string) + len(compile_deps_string)
    end_index = html_string.find(")</h2>", start_index)
    compile_deps_count = int(html_string[start_index: end_index])
    # Get the compile deps if any
    if compile_deps_count != 0:
        table_open_index = html_string.find(table_open_string, end_index)
        table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string)
        compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
        page_deps.update(compile_deps)

    return page_deps
72
def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
    """Collect dependency paths from one "<tbody>"..."</tbody>" slice.

    The slice is expected to span from the "<tbody>" tag to the matching
    "</tbody>" tag. expected_count is currently unused and is kept for
    interface compatibility with callers.
    """
    table_deps = set()
    start_index = html_string.find(table_row_open_string, table_open_index, table_close_index)

    while start_index != -1:
        close_index = html_string.find(table_row_close_string, start_index, table_close_index)
        if close_index == -1:
            # Malformed row (no "</tr>" before the table end): bail out
            # instead of computing garbage indices from find()'s -1.
            break
        end_index = close_index + len(table_row_close_string)
        row_dep = get_dep_from_row(html_string, start_index, end_index)
        if row_dep is not None:
            table_deps.add(row_dep)
        start_index = html_string.find(table_row_open_string, end_index, table_close_index)

    return table_deps
87
def get_dep_from_row(html_string, row_start_index, row_end_index):
    """Extract the dependency path from one table row.

    The row is expected to span from the "<tr" tag to the "</tr>" tag and
    to contain 5 matched "<td...></td>" pairs; the fourth pair holds the
    desired dependency link (the 5th lists updated versions).
    Returns the "org/artifact/version" path, or None when absent.
    """
    # Locate the first cell, then advance three more times so the indices
    # bracket the fourth "<td>" element.
    start_index = html_string.find(table_row_cell_open_string, row_start_index, row_end_index)
    end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
        + len(table_row_cell_close_string)
    for _ in range(3):
        start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index)
        end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
            + len(table_row_cell_close_string)

    return get_dep_from_cell(html_string, start_index, end_index)
104
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    """Pull the "org/artifact/version" path out of one <td> cell.

    Returns the href path that follows the artifact anchor prefix, or
    None when the cell contains no such link.
    """
    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if prefix_index == -1:
        # No artifact anchor in this cell; check find()'s result directly
        # rather than encoding "not found" as len(prefix) - 1 arithmetic.
        return None
    start_index = prefix_index + len(unique_identified_prefix)
    end_index = html_string.find("\"", start_index, cell_end_index)
    return html_string[start_index:end_index]
112
# Same crawl as get_deps_for_artifact, but formatted as maven coordinates.
def get_mvn_coordinates_deps(url):
    """Return the dependency set as "mvn:org:artifact:version" strings."""
    mvn_coords = set()
    for dep_path in get_deps_for_artifact(url):
        # dep_path looks like "org_id/artifact_id/version_id".
        first_slash = dep_path.find("/", 0)
        second_slash = dep_path.find("/", first_slash + 1)
        org_id = dep_path[:first_slash]
        artifact_id = dep_path[first_slash + 1: second_slash]
        version = dep_path[second_slash + 1:]
        mvn_coords.add("mvn:" + org_id + ":" + artifact_id + ":" + version)

    return mvn_coords
125
126
def print_collection(lst):
    """Write each element of *lst* to stdout, one per line."""
    for item in lst:
        print(item)
132
133
def main():
    """Prompt for an artifact URL and output format, then print its deps."""
    prompt = ("Please enter the url of the repo whose dependencies you would like?"
              "\n(this should be a fully qualified url\nex: https://mvnrepository."
              "com/artifact/com.google.guava/guava/19.0)")
    url = input(prompt)
    answer = input("Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)")
    if answer in ("y", "Y"):
        print_collection(get_mvn_coordinates_deps(url))
    else:
        print_collection(get_deps_for_artifact(url))


if __name__ == '__main__':
    main()