blob: 489a23f1cf55456b9bd3b46b7c8c7a12c9cb25e5 [file] [log] [blame]
Aaron Kruglikov041b2542017-06-13 16:21:19 -07001#!/usr/bin/env python
2"""
This is a scraper-based series of scripts designed to retrieve the complete set
of deps of a given artifact. This script is brittle: it will USUALLY fetch the
set of dependencies, but it may fail in certain cases, including on pages
with unusual URL table structures.
7
8In its current configuration it will list the dependencies that were officially
9supported at release, it will not note any updates whatsoever.
10"""
11import urllib.request
12import urllib.response
13
14
# Module-level scratch state for the crawl (see get_deps_for_artifact):
# artifacts queued for processing and artifacts already processed.
# human_readable_deps appears unused in this portion of the file.
fringe = set()
already_checked = set()
human_readable_deps = set()

# All artifact pages live under this prefix; crawl keys are the URL suffix
# of the form "org_id/artifact_id/version_id".
url_prefix = "https://mvnrepository.com/artifact/"

# Heading that precedes the compile-dependency count, e.g.
# "<h2>Compile Dependencies (12)</h2>".
compile_deps_string = "<h2>Compile Dependencies ("

# Raw-HTML markers used to slice the dependency table, its rows, and the
# cells of each row out of the fetched page text.
table_open_string = "<tbody"
table_close_string = "</tbody>"

table_row_open_string = "<tr>"
table_row_close_string = "</tr>"

table_row_cell_open_string = "<td"
table_row_cell_close_string = "</td>"

# Anchor-tag prefix of a dependency link; the "org/artifact/version" path
# follows immediately after it, terminated by a double quote.
unique_identified_prefix = "<a class=\"vbtn release\" href=\"/artifact/"
33
34
def get_deps_for_artifact(url):
    """Crawl mvnrepository.com from *url* and return the set of all
    transitively reachable compile dependencies, including the starting
    artifact itself, as "org_id/artifact_id/version_id" path strings.

    url MUST be of the form
    https://mvnrepository.com/artifact/*organization_id*/*artifact_id*/*version_id*

    Raises:
        ValueError: if url does not start with the maven repository prefix.
    """
    if not url.startswith(url_prefix):
        raise ValueError("The url must begin with a valid https address for the maven central repository.")
    # Use local state rather than the module-level fringe/already_checked
    # sets so repeated calls do not accumulate results from earlier crawls.
    to_visit = {url.replace(url_prefix, "")}
    visited = set()
    while to_visit:
        current = to_visit.pop()
        if current in visited:
            continue
        visited.add(current)
        for dep in get_deps_from_page(url_prefix + current):
            if dep not in visited:
                to_visit.add(dep)
    return visited
51
52
# page is expected to be a valid url
def get_deps_from_page(url):
    """Fetch *url* and return the set of compile-scope dependency paths
    listed on that artifact page (empty set when there are none)."""
    page_deps = set()
    # Close the connection deterministically instead of leaking the
    # response object.
    with urllib.request.urlopen(url) as response:
        html_string = response.read().decode()

    # The heading reads "<h2>Compile Dependencies (N)</h2>"; extract N.
    start_index = html_string.find(compile_deps_string) + len(compile_deps_string)
    end_index = html_string.find(")</h2>", start_index)
    compile_deps_count = int(html_string[start_index: end_index])
    # Get the compile deps if any
    if compile_deps_count != 0:
        table_open_index = html_string.find(table_open_string, end_index)
        table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string)
        compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
        page_deps.update(compile_deps)

    return page_deps
72
def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
    """Collect dependency paths from one "<tbody>"..."</tbody>" slice.

    The slice is expected to span from the "<tbody>" tag to the matching
    "</tbody>" tag. expected_count is currently unused and is kept for
    interface compatibility with callers.
    """
    table_deps = set()
    start_index = html_string.find(table_row_open_string, table_open_index, table_close_index)

    while start_index != -1:
        close_index = html_string.find(table_row_close_string, start_index, table_close_index)
        if close_index == -1:
            # Malformed row (no "</tr>" before the table end): bail out
            # instead of computing garbage indices from find()'s -1.
            break
        end_index = close_index + len(table_row_close_string)
        row_dep = get_dep_from_row(html_string, start_index, end_index)
        if row_dep is not None:
            table_deps.add(row_dep)
        start_index = html_string.find(table_row_open_string, end_index, table_close_index)

    return table_deps
87
def get_dep_from_row(html_string, row_start_index, row_end_index):
    """Extract the dependency path from one table row.

    The row is expected to span from the "<tr" tag to the "</tr>" tag and
    to contain 5 matched "<td...></td>" pairs; the fourth pair holds the
    desired dependency link (the 5th lists updated versions).
    Returns the "org/artifact/version" path, or None when absent.
    """
    # Locate the first cell, then advance three more times so the indices
    # bracket the fourth "<td>" element.
    start_index = html_string.find(table_row_cell_open_string, row_start_index, row_end_index)
    end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
        + len(table_row_cell_close_string)
    for _ in range(3):
        start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index)
        end_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)\
            + len(table_row_cell_close_string)

    return get_dep_from_cell(html_string, start_index, end_index)
104
def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
    """Pull the "org/artifact/version" path out of one <td> cell.

    Returns the href path that follows the artifact anchor prefix, or
    None when the cell contains no such link.
    """
    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
    if prefix_index == -1:
        # No artifact anchor in this cell; check find()'s result directly
        # rather than encoding "not found" as len(prefix) - 1 arithmetic.
        return None
    start_index = prefix_index + len(unique_identified_prefix)
    end_index = html_string.find("\"", start_index, cell_end_index)
    return html_string[start_index:end_index]
112
# Same crawl as get_deps_for_artifact, but formatted as maven coordinates.
def get_mvn_coordinates_deps(url):
    """Return the dependency set as "mvn:org:artifact:version" strings."""
    mvn_coords = set()
    for dep_path in get_deps_for_artifact(url):
        # dep_path looks like "org_id/artifact_id/version_id".
        first_slash = dep_path.find("/", 0)
        second_slash = dep_path.find("/", first_slash + 1)
        org_id = dep_path[:first_slash]
        artifact_id = dep_path[first_slash + 1: second_slash]
        version = dep_path[second_slash + 1:]
        mvn_coords.add("mvn:" + org_id + ":" + artifact_id + ":" + version)

    return mvn_coords
125
126
def print_collection(lst):
    """Write each element of *lst* to stdout, one per line."""
    for item in lst:
        print(item)
132
133
def main():
    """Prompt for an artifact URL and output format, then print its deps."""
    prompt = ("Please enter the url of the repo whose dependencies you would like?"
              "\n(this should be a fully qualified url\nex: https://mvnrepository."
              "com/artifact/com.google.guava/guava/19.0)")
    url = input(prompt)
    answer = input("Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)")
    if answer in ("y", "Y"):
        print_collection(get_mvn_coordinates_deps(url))
    else:
        print_collection(get_deps_for_artifact(url))


if __name__ == '__main__':
    main()