python · data collection · HTML (crawler 3)

Import plug-in tool

xpath (XML Path Language) is a language for finding information in XML and HTML documents. It can be used to traverse elements and attributes in XML and HTML documents.
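XPath Helper: a browser plug-in that evaluates an XPath expression against the current page and returns the matching information (for example, links)
JSON Handle: a browser plug-in for parsing and viewing JSON data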

XPath syntax

Select node:

| Expression | Description | Example | Result |
| --- | --- | --- | --- |
| nodename | Select all child nodes of the named node | bookstore | Selects all child nodes of the bookstore node |
| / | At the start of a path, select from the root node; otherwise, select a direct child of the preceding node | /bookstore | Selects all bookstore nodes under the root element |
| // | Select matching nodes anywhere in the document | //book | Finds all book nodes anywhere in the document |
| @ | Select an attribute of a node | //book[@price] | Selects all book nodes that have a price attribute |
| . | The current node | ./a | Selects the a tags under the current node |

Knowledge points needing attention:

  1. The difference between / and //: / selects only direct child nodes, while // selects descendant nodes at any depth. In general // is used more often, but it depends on the situation.

  2. contains: sometimes an attribute holds multiple values; in that case you can use the contains() function to match one of them. Example:

    //div[contains(@class,'job_detail')]
    
  3. Subscripts in predicates start with 1, not 0.

predicate:

The predicate is used to find a specific node or a node containing a specified value, which is embedded in square brackets.
In the following table, we list some path expressions with predicates and the results of the expressions:

| Path expression | Description |
| --- | --- |
| /bookstore/book[1] | Selects the first book element under bookstore |
| /bookstore/book[last()] | Selects the last book element under bookstore |
| /bookstore/book[position()<3] | Selects the first two book elements under bookstore |
| //book[@price] | Selects all book elements that have a price attribute |
| //book[@price=10] | Selects all book elements whose price attribute equals 10 |
from lxml import etree

# Parse the local file tencent.html with an HTML parser configured for UTF-8.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('tencent.html', parser=parser)
# Uncomment to dump the whole parsed document:
# print(etree.tostring(html,encoding='utf-8').decode('utf-8'))

# 1. Get all tr tags.
# xpath() returns a list here, so a single element has to be taken out by index.
trs = html.xpath("//tr")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 2. Get the second tr tag (XPath positions start at 1, Python list indexes at 0).
tr = html.xpath("//tr[2]")[0]
print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 3. Get all tr tags with class equal to even.
trs = html.xpath("//tr[@class='even']")
# Alternative: match when the class attribute merely contains the given value.
# This second query replaces the result of the first, so only its matches are
# printed below.
# NOTE(review): 'hubei' is kept from the original expression; confirm it is a
# class name that actually occurs in tencent.html.
trs = html.xpath("//tr[contains(@class,'hubei')]")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

# 4. Get the href attribute of all a tags and print it as an absolute URL.
aList = html.xpath("//a/@href")
for a in aList:
    print('http://hr.tencent.com/' + a)

# 5. Obtain all position information (plain text).
# The first tr is skipped; presumably it is the table header - confirm against
# tencent.html.
trs = html.xpath("//tr[position()>1]")
positions = []
for tr in trs:
    # Every query is relative to the current row (leading "."), and [0] takes
    # the first match; a row without a link or a cell raises IndexError.
    href = tr.xpath(".//a/@href")[0]
    position = {
        'title': tr.xpath(".//td[1]//text()")[0],
        'url': 'http://hr.tencent.com/' + href,
        'category': tr.xpath(".//td[2]/text()")[0],
        'number': tr.xpath(".//td[3]/text()")[0],
        'city': tr.xpath(".//td[4]/text()")[0],
        'pubtime': tr.xpath(".//td[5]/text()")[0],
    }
    positions.append(position)
print(positions)

Crawling the Movie Paradise site (dytt8.net)

import requests
from lxml import etree

# Request headers passed to every requests.get() call in this script.
# Only a desktop-browser User-Agent is set; Host and Referer stay disabled.
headers = {}
headers['User-Agent'] = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)
# headers['Host'] = 'www.dytt8.net'
# headers['Referer'] = 'https://www.dytt8.net/'

# Scheme and host that the site-relative detail links are prefixed with.
BASE_DOMAIN = 'https://www.dytt8.net'

def _lines_until(infos, start, stop_prefix):
    """
    Return the stripped text nodes that follow a label.

    :param infos: list of text nodes taken from the detail page
    :param start: index of the first node to inspect
    :param stop_prefix: collection stops at the first stripped node that
        starts with this prefix (that node is not included)
    :return: list of stripped strings; blank nodes are kept as ''
    """
    lines = []
    for raw in infos[start:]:
        line = raw.strip()
        if line.startswith(stop_prefix):
            break
        lines.append(line)
    return lines


def pasrse_page(url):
    """
    Fetch one movie detail page and extract its fields into a dict.

    :param url: URL of the movie detail page
    :return: dict that always holds 'title', 'cover' (list of image src
        values) and 'download_url'; 'year', 'country', 'category', 'rating',
        'duration', 'director', 'actors' (list) and 'profile' are present
        only when the matching label is found in the page text
    :raises IndexError: if the title, the ``div#Zoom`` container or the
        download link cannot be found
    :raises UnicodeDecodeError: if the response body is not valid GBK
    """
    # Labels whose value sits in the same text node as the label itself,
    # paired with the dict key the value is stored under.
    # NOTE(review): these labels look machine-translated; confirm them
    # against the text that the live page really contains.
    simple_fields = (
        ("◎year  generation", 'year'),
        ("◎yield  land", 'country'),
        ("◎class  other", 'category'),
        ("Douban score", 'rating'),
        ("◎Douban score", 'rating'),
        ("◎slice  long", 'duration'),
        # Presumably the director label. It used to be written to
        # 'duration', which overwrote the running time parsed above.
        ("◎guide  Play", 'director'),
    )

    movie = {}  # Store details of a movie
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    # The page bytes are decoded explicitly as GBK.
    data = res.content.decode('gbk')
    html = etree.HTML(data)
    title = html.xpath("/html/body/div[1]/div/div[3]/div[3]/div[1]/div[2]/div[1]/h1/font//text()")[0]
    movie['title'] = title

    # All remaining fields are read from the div with id "Zoom".
    zoom = html.xpath("//div[@id='Zoom']")[0]
    movie['cover'] = zoom.xpath(".//img/@src")
    infos = zoom.xpath(".//text()")

    for index, info in enumerate(infos):
        simple = next(
            (pair for pair in simple_fields if info.startswith(pair[0])),
            None,
        )
        if simple is not None:
            label, key = simple
            movie[key] = info.replace(label, "").strip()
            continue

        if info.startswith("◎main  Play"):
            # The first actor shares the label's text node; the rest follow
            # as separate nodes until the next "◎" label.
            first = info.replace("◎main  Play", "").strip()
            movie['actors'] = [first] + _lines_until(infos, index + 1, "◎")
        elif info.startswith("◎simple  Introduce"):
            # Keep every non-blank line up to the "magnetism" marker rather
            # than only the last text node seen.
            first = info.replace("◎simple  Introduce", "").strip()
            parts = [first] + _lines_until(infos, index + 1, "magnetism")
            movie['profile'] = "\n".join(part for part in parts if part)

    movie['download_url'] = zoom.xpath('.//a/@href')[0]
    return movie
def get_detail_urls(url):
    """
    Get a link to the details of each movie on one list page.

    :param url: URL of a movie list page
    :return: list of detail-page URLs, built as BASE_DOMAIN + each href found
        inside the ``table`` elements whose class is ``tbspan``
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(res.text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are prefixed with BASE_DOMAIN as-is; presumably they are
    # site-relative paths starting with "/" - confirm against the page.
    return [BASE_DOMAIN + href for href in hrefs]
# Crawl list pages 1-50, parse every movie linked from them, print each
# result and keep it in `movies`.
movies = []
for page in range(1, 51):
    list_url = f"https://www.dytt8.net/html/gndy/dyzz/list_23_{page}.html"
    for detail_url in get_detail_urls(list_url):
        movie = pasrse_page(detail_url)
        print(movie)
        # Collect the result so that `movies` holds every parsed movie once
        # the crawl finishes.
        movies.append(movie)

Tags: html Python xpath

Posted by tecate1 on Tue, 18 Jan 2022 08:03:48 +1030