This script uses Selenium to drive a Chrome browser, retrieve Taobao search results, and parse the pages with BeautifulSoup.

When using Selenium, pay attention to detecting whether a page has finished loading, and to handling the exceptions raised when loading times out.

 import JSON import re from BS4 import BeautifulSoup from selenium import webdriver from selenium.common.exceptions import TimeoutException from import By from import WebDriverWait from import expected_conditions as EC (browser = webdriver.Chrome) # browser needs to be used repeatedly, so take out alone. Set a maximum wait time, waiting for the completion of the loading of wait target = WebDriverWait (browser, 10) def search (keyword): wait # prone loading time is long, so try is used to capture abnormal try: browser.get ('') # loading will take some time, setting the waiting time, waiting for loading # input button load waiting for input = wait.until (# set load target, it is a selector, parameters are needed to choose the way and waiting for loading the contents of EC.presence_of_element_located ((By.CSS_SELECTOR, #q)) # CSS selector and selection #) submit = wait.until (# submit button EC is behind the selection conditions, loading conditions the best is element_to_be_clickable, meaning elements can click on the EC.element_to_be_clickable ((By.CSS _SELECTOR, #J_TSearchForm > > button))) input.send_keys (keyword) # send_keys on the input box to input (#) to Submit search content, go to the next page for page elements # loaded, and returns the number of pages (total = wait.until EC.presence_of_element_located ("#mainsrp-pager (By.CSS_SELECTOR, > div > div > div > # waiting for loading after the completion of the acquisition information of get_products (return) total.text except TimeoutException: # after a timeout request again, so search (DEF) return recursive next_page (page_number): try: # page page input box and button input = wait.until (EC.presence_of_element_located ((By.CSS_SELECTOR #mainsrp-pager > div > div div > div.form & > Gt; input))) = wait.until (submit # submit button (EC.element_to_be_clickable (By.CSS_SELECTOR, "#mainsrp-pager > div > div; > div > div.form > span.btn.J_Submit))) input.clear (input.send_keys) (page_number) ( wait.until 
(EC.text_to_be_present_in_element # judgment page ((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > UL > > span'), str (page_number)) get_products (except)) TimeoutException: return next_page (page_number) def (get_products): # judge a single page is loaded from wait.until (EC.presence_of_element_located ((By.CSS_SELECTOR,'#mainsrp-itemlist,.Items.Item'))) HTML = browser.page_source # get the page source code, all The # was analyzed using BS soup = BeautifulSoup (HTML,'lxml') items = ('#mainsrp-itemlist.Items.Item') for item in items: image ('.pic.Img') = [0]['data-src'] price = ('.price strong') [0].text deal = ('.deal-cnt') [0].text[: -3] title = ('.title') [0].text.strip (shop) = ('.shop'[0].text.strip (location)) = ('.location') [0].text product = {'image': image,'price': price,'deal': deal,'title': title,'shop': shop,'location': location} save_text (product) # def (product) save_text download content: # saved as txt files, a additional write mode, encoding UTF-8 with open ('text.txt','a', encoding='utf-8') as f: # use JSON to convert st dictionary R format, plus a newline (json.dumps f.write (product, ensure_ascii=False) +'n'(f.close)) def (main): # total = search in Taobao search by keyword (' delicacy ') page number total = int (re.compile (# extracted by regular' (d+) '(total).Group (.Search) 1) for I in range) # page (2, total+1): # cycle contains before the tail does not contain next_page (I) browser.close (if) __name__'__main__' (

): Main =

For more details, please refer to the dedicated "Python crawling functions" summary tutorial.

That is the whole content of this article. I hope it helps you, and I hope you will continue to support Script Home.

This paper fixed link: | Script Home | +Copy Link

Article reprint please specify:Python uses Selenium+BeautifulSoup to crawl Taobao search pages | Script Home

You may also be interested in these articles!