Import plug-in tool
xpath (XML Path Language) is a language for finding information in XML and HTML documents. It can be used to traverse elements and attributes in XML and HTML documents.
xpath helper: returns information about the link
json handle: parsing json files
XPath syntax
Select node:
expression | describe | Example | result |
---|---|---|---|
nodename | Select all children of this node | bookstore | Select all child nodes under the bookstore |
/ | If it is at the front, it means to select from the root node. Otherwise, select a node under a node | /bookstore | Select all bookstore nodes under the root element |
// | Select a node from the global node, anywhere | //book | Find all book nodes from the global node |
@ | Select the attribute of a node | //book[@price] | Select all book nodes with the price attribute |
. | Current node | ./a | Select the a tags directly under the current node |
Knowledge points needing attention:
- The difference between / and //: / selects only direct child nodes, while // selects descendant nodes at any depth. In practice // is used more often, but it depends on the situation.
- contains: sometimes an attribute holds multiple values; in that case use the contains function to match one of them. Example:
//div[contains(@class,'job_detail')]
- Subscripts in predicates start at 1, not 0.
predicate:
The predicate is used to find a specific node or a node containing a specified value, which is embedded in square brackets.
In the following table, we list some path expressions with predicates and the results of the expressions:
Path expression | describe |
---|---|
/bookstore/book[1] | Select the first book child element under the bookstore |
/bookstore/book[last()] | Select the last book element under the bookstore (use last()-1 for the penultimate one). |
bookstore/book[position()<3] | Select the first two book child elements under the bookstore. |
//book[@price] | Select the book element with the price attribute |
//book[@price=10] | Select all book elements with the attribute price equal to 10 |
from lxml import etree

# Parse the locally saved listing page with an explicit HTML parser and
# encoding, then run a series of example XPath queries against it.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('tencent.html', parser=parser)

# 1. Get all tr tags.
# A node-selecting xpath() call returns a list, so iterate over it or index it.
trs = html.xpath("//tr")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 2. Get the second tr tag.
# The XPath predicate [2] is 1-based; the Python [0] picks the first match.
tr = html.xpath("//tr[2]")[0]
print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 3. Get tr tags by class.
# Exact match: the class attribute must equal 'even'.
trs = html.xpath("//tr[@class='even']")
# Substring match: the class attribute only has to contain the value.
# Only the result of this second query is printed below.
trs = html.xpath("//tr[contains(@class,'hubei')]")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 4. Get the href attribute of all a tags.
# Selecting an attribute yields strings rather than elements.
aList = html.xpath("//a/@href")
for a in aList:
    print('http://hr.tencent.com/' + a)

# 5. Obtain all position information (plain text).
# position()>1 skips the first row of each table.
trs = html.xpath("//tr[position()>1]")
positions = []
for tr in trs:
    # A leading "." makes each query relative to the current row; without it
    # "//" would search the whole document again.
    href = tr.xpath(".//a/@href")[0]
    fullurl = 'http://hr.tencent.com/' + href
    title = tr.xpath(".//td[1]//text()")[0]
    category = tr.xpath(".//td[2]/text()")[0]
    number = tr.xpath(".//td[3]/text()")[0]
    city = tr.xpath(".//td[4]/text()")[0]
    pubtime = tr.xpath(".//td[5]/text()")[0]
    position = {
        'title': title,
        'url': fullurl,
        'category': category,
        'number': number,
        'city': city,
        'pubtime': pubtime,
    }
    positions.append(position)
print(positions)
Scraping the Movie Paradise site (dytt8.net)
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
BASE_DOMAIN = 'https://www.dytt8.net'

# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Single-fragment fields: text that starts a fragment -> key in the movie dict.
# NOTE(review): these marker strings look machine-translated; confirm they
# match the text that actually appears on the detail pages.
_FIELD_MARKERS = {
    "◎year generation": 'year',
    "◎yield land": 'country',
    "◎class other": 'category',
    "Douban score": 'rating',
    "◎Douban score": 'rating',
    "◎slice long": 'duration',
    "◎guide Play": 'director',
}
_ACTORS_MARKER = "◎main Play"
_PROFILE_MARKER = "◎simple Introduce"


def _fragments_until(infos, start, stop_prefix):
    """Return stripped fragments from infos[start:] up to the first one starting with stop_prefix."""
    collected = []
    for raw in infos[start:]:
        fragment = raw.strip()
        if fragment.startswith(stop_prefix):
            break
        collected.append(fragment)
    return collected


def pasrse_page(url):
    """
    Request a movie detail page and extract its fields.

    The function name (including its spelling) is kept for existing callers.

    :param url: absolute URL of a movie detail page
    :return: dict with 'title', 'cover' (list of image URLs) and
        'download_url', plus whichever of 'year', 'country', 'category',
        'rating', 'duration', 'director', 'actors' (list) and 'profile'
        were found in the page text
    :raises IndexError: if the title, the Zoom container or a link inside it
        is missing from the page
    :raises UnicodeDecodeError: if the response body is not valid GBK
    :raises requests.RequestException: on network failure or timeout
    """
    movie = {}  # Store details of a movie
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    data = res.content.decode('gbk')
    html = etree.HTML(data)

    title = html.xpath("/html/body/div[1]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font//text()")[0]
    movie['title'] = title

    zoom = html.xpath("//div[@id='Zoom']")[0]
    movie['cover'] = zoom.xpath(".//img/@src")

    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith(_ACTORS_MARKER):
            # The cast spans several fragments: the remainder of the marker
            # fragment plus everything up to the next "◎" field.
            first_actor = info.replace(_ACTORS_MARKER, "").strip()
            movie['actors'] = [first_actor] + _fragments_until(infos, index + 1, "◎")
        elif info.startswith(_PROFILE_MARKER):
            # Keep every fragment of the synopsis instead of only the last
            # one seen before the stop marker.
            lead = info.replace(_PROFILE_MARKER, "").strip()
            parts = [lead] + _fragments_until(infos, index + 1, "magnetism")
            profile = "\n".join(part for part in parts if part)
            if profile:
                movie['profile'] = profile
        else:
            for marker, key in _FIELD_MARKERS.items():
                if info.startswith(marker):
                    movie[key] = info.replace(marker, "").strip()
                    break

    movie['download_url'] = zoom.xpath('.//a/@href')[0]
    return movie


def get_detail_urls(url):
    """
    Get a link to the details of each movie on this listing page.

    :param url: absolute URL of a listing page
    :return: list of absolute detail-page URLs (BASE_DOMAIN + each href found
        inside a table with class 'tbspan')
    :raises requests.RequestException: on network failure or timeout
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(res.text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]


# Walk listing pages 1..50 and print each movie as it is parsed.
# `movies` is left empty; append to it in the loop if the results are needed.
movies = []
for x in range(1, 51):
    url = f"https://www.dytt8.net/html/gndy/dyzz/list_23_{x}.html"
    detail_urls = get_detail_urls(url)
    for detail_url in detail_urls:
        movie = pasrse_page(detail_url)
        print(movie)