tools/dev/bin/onos-dep-discover - onos - Gitiles

 #!/usr/bin/env python
 """
 This is a scraper-based series of scripts designed to retrieve the complete set
 of deps of a given artifact. This script is brittle: it will USUALLY fetch the
 set of dependencies, but it may fail in certain cases, including on pages
 with unusual url table structures.

 In its current configuration it will list the dependencies that were officially
 supported at release; it will not note any updates whatsoever.
 """
 import urllib.request
 import urllib.response


 # Traversal state shared by every call to get_deps_for_artifact.
 fringe = set()  # artifact paths queued for fetching
 already_checked = set()  # artifact paths that have already been processed
 human_readable_deps = set()  # not referenced by any function shown in this file

 # Address under which every artifact page is expected to live.
 url_prefix = "https://mvnrepository.com/artifact/"

 # Heading text that is immediately followed by the compile dependency count.
 compile_deps_string = "<h2>Compile Dependencies ("

 # Markers used to cut a page into table body, rows and cells. Note that the
 # body and cell openers have no closing ">", whereas the row opener is the
 # complete "<tr>" tag.
 table_open_string = "<tbody"
 table_row_open_string = "<tr>"
 table_row_cell_open_string = "<td"

 table_close_string = "</tbody>"
 table_row_close_string = "</tr>"
 table_row_cell_close_string = "</td>"

 # Start of the anchor whose href holds the dependency path.
 unique_identified_prefix = '<a class="vbtn release" href="/artifact/'


 def get_deps_for_artifact(url):
     """Collect every artifact reachable through compile dependencies.

     Starting from ``url``, each artifact page is fetched once and the
     dependencies it lists are queued until nothing unvisited remains.

     ``url`` MUST be of the form
     https://mvnrepository.com/artifact/<organization_id>/<artifact_id>/<version_id>

     The traversal state is kept in the module-level ``fringe`` and
     ``already_checked`` sets, so entries from earlier calls are still
     present on later calls.

     Returns the ``already_checked`` set of paths relative to ``url_prefix``;
     the starting artifact itself is part of that set.

     Raises ValueError when ``url`` does not begin with ``url_prefix``.
     """
     if not url.startswith(url_prefix):
         raise ValueError("The url must begin with a valid https address for the maven central repository.")
     # Cut off only the leading prefix; str.replace would also delete any
     # later occurrence of the prefix inside the path.
     fringe.add(url[len(url_prefix):])
     while fringe:
         to_process = fringe.pop()
         if to_process in already_checked:
             continue
         already_checked.add(to_process)
         # Visited paths are dropped here so the fringe only holds new work.
         fringe.update(
             dep
             for dep in get_deps_from_page(url_prefix + to_process)
             if dep not in already_checked
         )
     return already_checked


 def get_deps_from_page(url):
     """Return the set of compile dependencies listed on one artifact page.

     ``url`` is expected to be a valid, fetchable page address. The page is
     downloaded, the "Compile Dependencies (N)" heading is located, and when
     N is non-zero the first table body after the heading is parsed.

     Returns a set of dependency paths. The set is empty when the heading is
     missing or incomplete, or when the page reports zero dependencies.

     Errors raised by ``urlopen`` propagate to the caller, and ``int`` raises
     ValueError when the text inside the heading's parentheses is not a number.
     """
     page_deps = set()
     # get the string version of this site; the with-block closes the
     # response instead of leaving the connection open
     with urllib.request.urlopen(url) as response:
         html_string = response.read().decode()

     # Determine how many compile deps
     heading_index = html_string.find(compile_deps_string)
     if heading_index == -1:
         # No heading on this page: without this check the -1 from find()
         # became part of an offset and int() was fed unrelated text.
         return page_deps
     start_index = heading_index + len(compile_deps_string)
     end_index = html_string.find(")</h2>", start_index)
     if end_index == -1:
         return page_deps
     compile_deps_count = int(html_string[start_index: end_index])
     # Get the compile deps if any
     if compile_deps_count != 0:
         table_open_index = html_string.find(table_open_string, end_index)
         table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string)
         compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
         page_deps.update(compile_deps)

     return page_deps

 def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
     """Extract the dependency path of every row inside one table body.

     The table is expected to run from the "<tbody" tag at
     ``table_open_index`` to the end of the "</tbody>" tag at
     ``table_close_index``. ``expected_count`` is accepted but not used.

     Returns a set of dependency paths; rows for which get_dep_from_row
     yields None are skipped.
     """
     table_deps = set()
     start_index = html_string.find(table_row_open_string, table_open_index, table_close_index)

     while start_index != -1:
         close_tag_index = html_string.find(table_row_close_string, start_index, table_close_index)
         if close_tag_index == -1:
             # Unterminated row: stop here. Without this check the end offset
             # fell back to a small constant, the next search restarted near
             # the top of the document and the loop could run forever.
             break
         end_index = close_tag_index + len(table_row_close_string)
         row_dep = get_dep_from_row(html_string, start_index, end_index)
         if row_dep is not None:
             table_deps.add(row_dep)
         start_index = html_string.find(table_row_open_string, end_index, table_close_index)

     return table_deps

 def get_dep_from_row(html_string, row_start_index, row_end_index):
     """Return the dependency path held in the fourth cell of a table row.

     The row is expected to run from its "<tr>" tag to the end of its "</tr>"
     tag and to contain 5 pairs of matched "<td...></td>" tags. The fourth
     pair contains the desired information; the 5th contains updated versions.

     Returns whatever get_dep_from_cell yields for the fourth cell, or None
     when the row does not contain four complete cells.
     """
     end_index = row_start_index
     start_index = -1
     # Walk cell by cell up to the fourth "<td" ... "</td>" pair. A missing
     # cell ends the search; reusing the -1 from find() as an offset would
     # move the search outside of this row.
     for _ in range(4):
         start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index)
         if start_index == -1:
             return None
         cell_close_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)
         if cell_close_index == -1:
             return None
         end_index = cell_close_index + len(table_row_cell_close_string)

     return get_dep_from_cell(html_string, start_index, end_index)

 def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
     """Return the artifact path of the release link inside one table cell.

     Searches ``html_string`` between ``cell_start_index`` and
     ``cell_end_index`` for ``unique_identified_prefix`` and returns the text
     between that prefix and the next double quote.

     Returns None when the cell has no such link, or when the link has no
     closing quote inside the cell.
     """
     prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
     if prefix_index == -1:
         return None
     start_index = prefix_index + len(unique_identified_prefix)
     end_index = html_string.find("\"", start_index, cell_end_index)
     if end_index == -1:
         # A -1 slice end would return nearly the whole rest of the document.
         return None
     return html_string[start_index:end_index]

 def get_mvn_coordinates_deps(url):
     """Produce the dependency set of ``url`` in mvn coordinate style.

     Every path returned by get_deps_for_artifact is cut at its first two "/"
     characters and re-joined as "mvn:<org>:<artifact>:<version>".
     """
     def to_coordinate(path):
         # Positions of the two separators between org, artifact and version.
         first_slash = path.find("/", 0)
         second_slash = path.find("/", first_slash + 1)
         pieces = (
             "mvn",
             path[:first_slash],
             path[first_slash + 1: second_slash],
             path[second_slash + 1:],
         )
         return ":".join(pieces)

     return {to_coordinate(path) for path in get_deps_for_artifact(url)}


 def print_collection(lst):
     """Print each element of ``lst`` on its own line, in iteration order."""
     for item in lst:
         print(item)


 def main():
     """Ask for an artifact url and an output style, then print the dependency set."""
     url_question = ("Please enter the url of the repo whose dependencies you would like?"
                     "\n(this should be a fully qualified url\nex: https://mvnrepository."
                     "com/artifact/com.google.guava/guava/19.0)")
     style_question = "Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)"
     url = input(url_question)
     mvn_coords = input(style_question)
     # Only an exact "y" or "Y" selects coordinate output; any other answer
     # yields url style paths.
     collect = get_mvn_coordinates_deps if mvn_coords in ("y", "Y") else get_deps_for_artifact
     print_collection(collect(url))


 # Run the interactive prompt only when executed as a script.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python
	"""
	This is a scraper-based series of scripts designed to retrieve the complete set
	of deps of a given artifact. This script is brittle: it will USUALLY fetch the
	set of dependencies, but it may fail in certain cases, including on pages
	with unusual url table structures.

	In its current configuration it will list the dependencies that were officially
	supported at release; it will not note any updates whatsoever.
	"""
	import urllib.request
	import urllib.response


	# Traversal state shared by every call to get_deps_for_artifact.
	fringe = set()  # artifact paths queued for fetching
	already_checked = set()  # artifact paths that have already been processed
	human_readable_deps = set()  # not referenced by any function shown in this file

	# Address under which every artifact page is expected to live.
	url_prefix = "https://mvnrepository.com/artifact/"

	# Heading text that is immediately followed by the compile dependency count.
	compile_deps_string = "<h2>Compile Dependencies ("

	# Markers used to cut a page into table body, rows and cells. Note that the
	# body and cell openers have no closing ">", whereas the row opener is the
	# complete "<tr>" tag.
	table_open_string = "<tbody"
	table_row_open_string = "<tr>"
	table_row_cell_open_string = "<td"

	table_close_string = "</tbody>"
	table_row_close_string = "</tr>"
	table_row_cell_close_string = "</td>"

	# Start of the anchor whose href holds the dependency path.
	unique_identified_prefix = '<a class="vbtn release" href="/artifact/'


	def get_deps_for_artifact(url):
	    """Collect every artifact reachable through compile dependencies.

	    Starting from ``url``, each artifact page is fetched once and the
	    dependencies it lists are queued until nothing unvisited remains.

	    ``url`` MUST be of the form
	    https://mvnrepository.com/artifact/<organization_id>/<artifact_id>/<version_id>

	    The traversal state is kept in the module-level ``fringe`` and
	    ``already_checked`` sets, so entries from earlier calls are still
	    present on later calls.

	    Returns the ``already_checked`` set of paths relative to ``url_prefix``;
	    the starting artifact itself is part of that set.

	    Raises ValueError when ``url`` does not begin with ``url_prefix``.
	    """
	    if not url.startswith(url_prefix):
	        raise ValueError("The url must begin with a valid https address for the maven central repository.")
	    # Cut off only the leading prefix; str.replace would also delete any
	    # later occurrence of the prefix inside the path.
	    fringe.add(url[len(url_prefix):])
	    while fringe:
	        to_process = fringe.pop()
	        if to_process in already_checked:
	            continue
	        already_checked.add(to_process)
	        # Visited paths are dropped here so the fringe only holds new work.
	        fringe.update(
	            dep
	            for dep in get_deps_from_page(url_prefix + to_process)
	            if dep not in already_checked
	        )
	    return already_checked


	def get_deps_from_page(url):
	    """Return the set of compile dependencies listed on one artifact page.

	    ``url`` is expected to be a valid, fetchable page address. The page is
	    downloaded, the "Compile Dependencies (N)" heading is located, and when
	    N is non-zero the first table body after the heading is parsed.

	    Returns a set of dependency paths. The set is empty when the heading is
	    missing or incomplete, or when the page reports zero dependencies.

	    Errors raised by ``urlopen`` propagate to the caller, and ``int`` raises
	    ValueError when the text inside the heading's parentheses is not a number.
	    """
	    page_deps = set()
	    # get the string version of this site; the with-block closes the
	    # response instead of leaving the connection open
	    with urllib.request.urlopen(url) as response:
	        html_string = response.read().decode()

	    # Determine how many compile deps
	    heading_index = html_string.find(compile_deps_string)
	    if heading_index == -1:
	        # No heading on this page: without this check the -1 from find()
	        # became part of an offset and int() was fed unrelated text.
	        return page_deps
	    start_index = heading_index + len(compile_deps_string)
	    end_index = html_string.find(")</h2>", start_index)
	    if end_index == -1:
	        return page_deps
	    compile_deps_count = int(html_string[start_index: end_index])
	    # Get the compile deps if any
	    if compile_deps_count != 0:
	        table_open_index = html_string.find(table_open_string, end_index)
	        table_close_index = html_string.find(table_close_string, table_open_index) + len(table_close_string)
	        compile_deps = get_deps_from_table(html_string, table_open_index, table_close_index, compile_deps_count)
	        page_deps.update(compile_deps)

	    return page_deps

	def get_deps_from_table(html_string, table_open_index, table_close_index, expected_count):
	    """Extract the dependency path of every row inside one table body.

	    The table is expected to run from the "<tbody" tag at
	    ``table_open_index`` to the end of the "</tbody>" tag at
	    ``table_close_index``. ``expected_count`` is accepted but not used.

	    Returns a set of dependency paths; rows for which get_dep_from_row
	    yields None are skipped.
	    """
	    table_deps = set()
	    start_index = html_string.find(table_row_open_string, table_open_index, table_close_index)

	    while start_index != -1:
	        close_tag_index = html_string.find(table_row_close_string, start_index, table_close_index)
	        if close_tag_index == -1:
	            # Unterminated row: stop here. Without this check the end offset
	            # fell back to a small constant, the next search restarted near
	            # the top of the document and the loop could run forever.
	            break
	        end_index = close_tag_index + len(table_row_close_string)
	        row_dep = get_dep_from_row(html_string, start_index, end_index)
	        if row_dep is not None:
	            table_deps.add(row_dep)
	        start_index = html_string.find(table_row_open_string, end_index, table_close_index)

	    return table_deps

	def get_dep_from_row(html_string, row_start_index, row_end_index):
	    """Return the dependency path held in the fourth cell of a table row.

	    The row is expected to run from its "<tr>" tag to the end of its "</tr>"
	    tag and to contain 5 pairs of matched "<td...></td>" tags. The fourth
	    pair contains the desired information; the 5th contains updated versions.

	    Returns whatever get_dep_from_cell yields for the fourth cell, or None
	    when the row does not contain four complete cells.
	    """
	    end_index = row_start_index
	    start_index = -1
	    # Walk cell by cell up to the fourth "<td" ... "</td>" pair. A missing
	    # cell ends the search; reusing the -1 from find() as an offset would
	    # move the search outside of this row.
	    for _ in range(4):
	        start_index = html_string.find(table_row_cell_open_string, end_index, row_end_index)
	        if start_index == -1:
	            return None
	        cell_close_index = html_string.find(table_row_cell_close_string, start_index, row_end_index)
	        if cell_close_index == -1:
	            return None
	        end_index = cell_close_index + len(table_row_cell_close_string)

	    return get_dep_from_cell(html_string, start_index, end_index)

	def get_dep_from_cell(html_string, cell_start_index, cell_end_index):
	    """Return the artifact path of the release link inside one table cell.

	    Searches ``html_string`` between ``cell_start_index`` and
	    ``cell_end_index`` for ``unique_identified_prefix`` and returns the text
	    between that prefix and the next double quote.

	    Returns None when the cell has no such link, or when the link has no
	    closing quote inside the cell.
	    """
	    prefix_index = html_string.find(unique_identified_prefix, cell_start_index, cell_end_index)
	    if prefix_index == -1:
	        return None
	    start_index = prefix_index + len(unique_identified_prefix)
	    end_index = html_string.find("\"", start_index, cell_end_index)
	    if end_index == -1:
	        # A -1 slice end would return nearly the whole rest of the document.
	        return None
	    return html_string[start_index:end_index]

	def get_mvn_coordinates_deps(url):
	    """Produce the dependency set of ``url`` in mvn coordinate style.

	    Every path returned by get_deps_for_artifact is cut at its first two "/"
	    characters and re-joined as "mvn:<org>:<artifact>:<version>".
	    """
	    mvn_coords = set()
	    for elem in get_deps_for_artifact(url):
	        # Positions of the two separators between org, artifact and version.
	        artifact_start_index = elem.find("/", 0)
	        version_start_index = elem.find("/", artifact_start_index + 1)
	        org_id = elem[:artifact_start_index]
	        artifact_id = elem[artifact_start_index + 1: version_start_index]
	        version = elem[version_start_index + 1:]
	        mvn_coords.add(":".join(("mvn", org_id, artifact_id, version)))

	    return mvn_coords


	def print_collection(lst):
	    """Print each element of ``lst`` on its own line, in iteration order."""
	    for elem in lst:
	        print(elem)


	def main():
	    """Ask for an artifact url and an output style, then print the dependency set."""
	    url = input("Please enter the url of the repo whose dependencies you would like?" +
	                "\n(this should be a fully qualified url\nex: https://mvnrepository." +
	                "com/artifact/com.google.guava/guava/19.0)")
	    mvn_coords = input("Would you like maven coordinate output, enter 'y' for yes? (alternately url style paths will be provided)")
	    # Only an exact "y" or "Y" selects coordinate output; any other answer
	    # yields url style paths.
	    if mvn_coords in ("y", "Y"):
	        print_collection(get_mvn_coordinates_deps(url))
	    else:
	        print_collection(get_deps_for_artifact(url))


	# Run the interactive prompt only when executed as a script.
	if __name__ == '__main__':
	    main()