Source code for mossnet.build
#! /usr/bin/env python
from mossnet.MossNet import MossNet
from bs4 import BeautifulSoup
from re import search,sub
from sys import stderr
from urllib.request import urlopen
[docs]def build(moss_results_links, verbose=False):
'''Download MOSS results into a ``MossNet`` object
Args:
``moss_results_links`` (``list``): A list of MOSS result URLs
``verbose`` (``bool``): ``True`` to show verbose messages, otherwise ``False``
Returns:
``MossNet``: A ``MossNet`` object
'''
if isinstance(moss_results_links, str):
urls = [l.strip() for l in open(moss_results_links.strip()).read().strip().splitlines()]
else:
urls = [l.strip() for l in moss_results_links]
links = dict()
for url_num,url in enumerate(urls):
if verbose:
stderr.write("Parsing MOSS report %d of %d...\r" % (url_num+1, len(urls)))
bs = BeautifulSoup(urlopen(url).read().decode(), "lxml")
curr_filename = None
for row in bs.findAll('tr'):
cols = row.findAll('td')
if len(cols) != 3:
continue
try:
moss_url = cols[0].find_all('a', href=True)[0]['href']
except:
stderr.write("Failed to parse row: %s" % row); continue
if curr_filename is None:
curr_filename = cols[0].find_all('a', href=True)[0].text.split('/')[-1].split()[0].strip()
email1,email2 = [cols[i].find_all('a', href=True)[0].text.split('/')[-2] for i in [0,1]]
if email1 not in links:
links[email1] = dict()
if email2 not in links[email1]:
links[email1][email2] = dict()
if email2 not in links:
links[email2] = dict()
if email1 not in links[email2]:
links[email2][email1] = dict()
if curr_filename is None:
raise ValueError("Failed to parse filename from results URL: %s" % url)
if curr_filename in links[email1][email2] or curr_filename in links[email2][email1]:
raise ValueError("File '%s' found for (%s, %s) multiple times" % (curr_filename, email1, email2))
moss_url_base = '/'.join(moss_url.rstrip('/').split('/')[:-1])
main_html = urlopen(moss_url).read().decode()
if email1 not in main_html or email2 not in main_html:
raise RuntimeError("Didn't find the right email addresses in the match URL: %s" % moss_url)
top_url = '%s/%s' % (moss_url_base, main_html.split('<FRAME SRC=')[1].split(' ')[0].replace('"',''))
left_url = '%s/%s' % (moss_url_base, main_html.split('<FRAME SRC=')[2].split(' ')[0].replace('"',''))
right_url = '%s/%s' % (moss_url_base, main_html.split('<FRAME SRC=')[3].split(' ')[0].replace('"',''))
left_percent,right_percent = [int(part.split('(')[-1]) for part in urlopen(top_url).read().decode().split("%")[:2]]
left_html = sub(r'<(A|/A).*?>', "", urlopen(left_url).read().decode().split("<HR>")[1].split("</BODY>")[0].split("<PRE>")[1].split("</PRE>")[0].strip())
right_html = sub(r'<(A|/A).*?>', "", urlopen(right_url).read().decode().split("<HR>")[1].split("</BODY>")[0].split("<PRE>")[1].split("</PRE>")[0].strip())
links[email1][email2][curr_filename] = ((left_percent, left_html), (right_percent, right_html))
links[email2][email1][curr_filename] = ((right_percent, right_html), (left_percent, left_html))
return MossNet(links)