Look Good to Feel Good!!

# Importing required library files
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# Creating a class called LinkParser that inherits some methods from HTMLParser
class LinkParser(HTMLParser):

    # Adding some more functionality to HTMLParser
    def handle_starttag(self, tag, attrs):
        # Checking for the beginning of a link, since links normally appear in <a> tags
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # Grabbing the new URL and joining it with the base URL
                    # so it can be added to the collection of links
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links.append(newUrl)

    # Function to get links; spider() will call this
    def getLinks(self, url):
        self.links = []
        self.baseUrl = url
        # Using the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Real servers usually send 'text/html; charset=utf-8', so check the
        # prefix instead of testing for strict equality
        contentType = response.getheader('Content-Type') or ''
        if contentType.startswith('text/html'):
            htmlBytes = response.read()
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []

# spider() function, which takes a starting URL, a word to find,
# and the maximum number of pages to search through
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # Creating a LinkParser to get all links on each page
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Starting from the first page in the collection
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
            pagesToVisit = pagesToVisit + links
            print(" **Success!**")
        except Exception:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")
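To try the crawler out, call spider() with a starting page, a search word, and a page limit. The invocation below is just an illustrative sketch; the URL and search word are placeholders, not values from the original post:

# Hypothetical example: crawl up to 10 pages, starting from example.com,
# looking for the word "Python"; both arguments are placeholders
spider("http://example.com", "Python", 10)

Because newly discovered links are appended to the end of pagesToVisit, the crawler explores pages in roughly breadth-first order, and the maxPages cap keeps it from wandering the web indefinitely.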