A web crawler (also known by other names such as ant, automatic indexer, bot, web spider, web robot or web scutter) is an automated program or script that methodically scans, or “crawls”, through web pages to build an index of the data it is set to look for. This process is called web crawling or spidering.
Web crawlers have many uses, but at heart a crawler collects or mines data from the Internet. Most search engines use crawlers to keep their data up to date and to discover what’s new on the Internet; analytics companies and market researchers use them to gauge customer and market trends in a given geography. (source)
I wrote a simple web crawler in Python to mine data from a particular site, using Selenium and BeautifulSoup4, which is probably the best combination for this kind of work.
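Before diving into the full crawler, here is the basic division of labour between the two libraries: Selenium drives a real browser (so JavaScript-rendered pages load properly), and BeautifulSoup parses whatever HTML the browser ends up with. A minimal sketch, where the URL is just a placeholder:

```python
from selenium.webdriver import Firefox
from bs4 import BeautifulSoup

browser = Firefox()
browser.get('https://example.com')  # placeholder URL
# Hand the rendered page over to BeautifulSoup for parsing
soup = BeautifulSoup(browser.page_source, 'html.parser')
for link in soup.find_all('a', href=True):
    print link['href']  # e.g. list every link on the page
browser.quit()
```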
The complete code follows; you can also find it on gist. The best part about this code is that it is fail-safe: the crawler won’t stop even if it encounters an error. A true crawler, in a way.
```python
# -*- coding: utf-8 -*-
'''
Created on May 27, 2016

@author: abgupta
'''
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
import time, sys, traceback
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup


class Scraper(object):
    '''
    classdocs
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.url = 'https://sjobs.brassring.com/TGWebHost/home.aspx?partnerid=25667&siteid=5417'
        self.base_job_url = 'https://sjobs.brassring.com/TGWebHost/jobdetails.aspx?'
        self.browser = Firefox()
        self.first_page_search_opening_id = 'srchOpenLink'
        self.second_page_search_btn_id = 'ctl00_MainContent_submit2'
        self.next_link_id = 'yui-pg0-0-next-link'

    # Spinner: cycles through / - \ | to show progress on one console column
    def DrawSpinner(self, counter):
        if counter % 4 == 0:
            sys.stdout.write("/")
        elif counter % 4 == 1:
            sys.stdout.write("-")
        elif counter % 4 == 2:
            sys.stdout.write("\\")
        elif counter % 4 == 3:
            sys.stdout.write("|")
        sys.stdout.flush()
        sys.stdout.write('\b')

    def first_page(self, url):
        try:
            self.browser.get(url)
            #link = self.browser.find_element_by_link_text('Search openings')
            link = self.browser.find_element_by_id(self.first_page_search_opening_id)
            link.click()
            # wait for the page to load
            WebDriverWait(self.browser, timeout=100).until(
                lambda x: x.find_element_by_id(self.second_page_search_btn_id))
        except Exception as e:
            print 'exception= ', str(e)
            print 'stacktrace= ', traceback.print_exc()
            print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)

    def click_search_button(self):
        # Click search button
        link = self.browser.find_element_by_id(self.second_page_search_btn_id)
        link.click()
        # wait for the page to load
        WebDriverWait(self.browser, timeout=100).until(
            lambda x: x.find_element_by_class_name('t_full'))

    def click_next_button(self):
        # Click NEXT
        link = self.browser.find_element_by_id(self.next_link_id)
        link.click()
        # wait for the page to load
        WebDriverWait(self.browser, timeout=100).until(
            lambda x: x.find_element_by_class_name('t_full'))

    def get_page_source(self):
        page_source = self.browser.page_source.decode('utf8')
        f = open('myhtml.html', 'a')
        f.write(page_source)
        f.close()
        return page_source

    def get_job_info(self, new_browser, job_url):
        try:
            new_browser.get(job_url)
            html = new_browser.page_source
            soup = BeautifulSoup(html, 'html.parser')
            # Find designation
            data = soup.find('span', attrs={'id': 'Designation'})
            if data:
                #print data.text
                f = open('descriptions.txt', 'a')
                f.write(data.text + '\n')
                f.close()
            else:
                pass
            # Find Qualifications
            data_ql = soup.find('span', attrs={'id': 'Qualification'})
            if data_ql:
                #print data_ql.text
                f = open('descriptions.txt', 'a')
                f.write(data_ql.text + '\n')
                f.close()
            else:
                pass
        except Exception as e:
            print 'exception= ', str(e)
            #print 'stacktrace= ', traceback.print_exc()
            print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)

    def get_jobs(self):
        try:
            h = HTMLParser()
            html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
            soup = BeautifulSoup(html, 'html.parser')
            # Every job link on the results page has an id starting with 'popup'
            data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
            #print data
            counter = 0
            for a in data:
                if a.has_attr('href'):
                    counter = counter + 1
                    self.DrawSpinner(counter)
                    try:
                        self.get_job_info(self.browser, self.base_job_url + a['href'].split('?')[1])
                    except Exception:
                        continue
            print counter
        except Exception as e:
            print 'exception= ', str(e)
            #print 'stacktrace= ', traceback.print_exc()
            print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)

    def main(self):
        self.first_page(self.url)
        self.click_search_button()
        try:
            # ~5309 results at 50 per page -> walk every results page
            for i in range(1, int(5309 / 50) + 1):
                self.get_jobs()
                # keep a raw copy of the current results page
                f = open('myhtml.html', 'a')
                f.write(self.browser.page_source)
                f.close()
                # get_job_info navigated away, so re-open the results from
                # scratch and click Next i times to reach the following page
                self.first_page(self.url)
                self.click_search_button()
                for _ in range(1, i + 1):
                    self.click_next_button()
        except Exception as e:
            print 'exception= ', str(e)
            #print 'stacktrace= ', traceback.print_exc()
            print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)


if __name__ == '__main__':
    start_time = time.time()
    sys.stdout.flush()
    sys.stdout.write('\b')
    Scraper().main()
    sys.stdout.flush()
    sys.stdout.write('\b')
    end_time = time.time()
    print 'Processing Time = ', str(end_time - start_time)
```
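To run it, you’ll need Python 2 (the script uses print statements and the old HTMLParser module) and, since it launches Firefox directly, a pre-3.0 Selenium release that doesn’t require a separate geckodriver. Something like the following should work, with scraper.py as a stand-in filename:

```
pip install "selenium<3" beautifulsoup4
python scraper.py
```

As it runs, it appends the raw HTML of each results page to myhtml.html and the scraped designations and qualifications to descriptions.txt in the working directory.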