Are programmers single dogs? There is something I'm not sure I should say out loud.
Are programmers single dogs? I have a sentence: how can it be!!! There's no programmer's routine. You can't imagine...
Although most netizens assume that programmers are single dogs, in reality programmers are often the ones feeding everyone else "dog food" (publicly showing off their relationships).
Programmers also have romance and love. Programmers' love can be more routine
For the goddess that programmers like, programmers can skillfully obtain the goddess's interests, hobbies, buying styles, what they like to eat, what they like to drink... And then "start" from these aspects. The goddess is not easy to catch, so you can't imagine the routine of programmers.
Today, I'm going to be a matchmaker and give some benefits to male programmers...
Today's goal is to crawl the information of girls presented on the marriage online and save it in excel for you to select the girls you like..
The whole army attacked, the target website was start_url
start_url = 'https://www.csflhjw.com/zhenghun/34.html?page=1'
Open the interface
Right click and choose "Inspect" on one lady's marriage information. From this we can tell that the page is loaded synchronously.
Click elements to locate the picture address. In the box are the url address and picture address of the lady
We can see that the url address of the lady is incomplete, and then we need to splice the url in the code to see what changes the url address of page turning has
Click on page 2
https://www.csflhjw.com/zhenghun/34.html?page=2
Click on page 3
https://www.csflhjw.com/zhenghun/34.html?page=3
It can be seen that the change is at the end
Write a for loop to format the page URLs — there are 10 pages in total.
Code analysis: 1. Get all the women's url, the path of xpath will not be detailed..
2. Construct the url address of each lady
3. Then click on a lady's url address and use the same method to confirm that it is also loaded synchronously
4. The next step is to extract the html xpath of the lady's url address, print each one, and filter the unwanted ones
5. Finally, the preservation of documents
Print results:
Finally, the complete solution of the code
# !/usr/bin/nev python # -*-coding:utf8-*- import requests, os, csv from pprint import pprint from lxml import etree def main(): for i in range(1, 11): start_url = 'https://www.csflhjw.com/zhenghun/34.html?page={}'.format(i) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/87.0.4280.88 Safari/537.36' } response = requests.get(start_url, headers=headers).content.decode() # # pprint(response) # 3 data analysis html_str = etree.HTML(response) info_urls = html_str.xpath(r'//div[@class="e"]/div[@class="e-img"]/a/@href') # pprint(info_urls) # 4. Loop traversal construction img_info_url for info_url in info_urls: info_url = r'https://www.csflhjw.com' + info_url # print(info_url) # 5. Yes, info_url, parsing img_urls response = requests.get(info_url, headers=headers).content.decode() html_str = etree.HTML(response) # pprint(html_str) img_url = 'https://www.csflhjw.com/' + html_str.xpath(r'/html/body/div[4]/div/div[1]/div[2]/div[1]/div[' r'1]/img/@src')[0] # pprint(img_url) name = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/h2/text()')[0] # pprint(name) xueli = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[1]/text()')[0].split(': ')[1] # pprint(xueli) job = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[2]/text()')[0].split(': ')[1] # pprint(job) marital_status = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[3]/text()')[0].split( ': ')[1] # pprint(marital_status) is_child = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[4]/text()')[0].split(': ')[1] # pprint(is_child) home = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[5]/text()')[0].split(': ')[1] # pprint(home) workplace = html_str.xpath(r'//div[@class="team-info"]/div[@class="team-e"]/p[6]/text()')[0].split(': ')[1] # pprint(workplace) requ = html_str.xpath(r'/html/body/div[4]/div/div[1]/div[2]/div[2]/div[2]/p[2]/span/text()')[0].split(': 
')[1] # pprint(requ) requ = [requ if requ != str() else 'No requirement'][0] monologue = html_str.xpath(r'//div[@class="hunyin-1-3"]/p/text()') # pprint(monologue) monologue = [monologue[0].replace(' ', '').replace('\xa0', '') if monologue !=list() else 'nothing'][0] # pprint(monologue) zeo_age = html_str.xpath(r'/html/body/div[4]/div/div[1]/div[2]/div[2]/div[2]/p[1]/span[1]/text()')[0].split(': ')[1] zeo_age = [zeo_age if zeo_age!=str() else 'No requirement'][0] # pprint(zeo_age) zeo_address = html_str.xpath(r'/html/body/div[4]/div/div[1]/div[2]/div[2]/div[2]/p[1]/span[2]/text()')[0].split(': ')[1] zeo_address = [zeo_address if zeo_address!=str() else 'No requirement'][0] # pprint(zeo_address) if not os.path.exists(r'./{}'.format('Sister information data')): os.mkdir(r'./{}'.format('Sister information data')) csv_header = ['full name', 'education', 'occupation', 'marital status', 'With or without children', 'Whether to buy a house or not', 'Place of work', 'Age of mate selection', 'Mate City', 'Mate selection requirements', 'personal soliloquy ', 'Photo links'] with open(r'./{}/{}.csv'.format('Sister information data', 'Sister data'), 'w', newline='', encoding='gbk') as file_csv: csv_writer_header = csv.DictWriter(file_csv, csv_header) csv_writer_header.writeheader() try: with open(r'./{}/{}.csv'.format('Sister information data', 'Sister data'), 'a+', newline='', encoding='gbk') as file_csv: csv_writer = csv.writer(file_csv, delimiter=',') csv_writer.writerow([name, xueli, job, marital_status, is_child, home, workplace, zeo_age, zeo_address, requ, monologue, img_url]) print(r'***Sister information data:{}'.format(name)) except Exception as e: with open(r'./{}/{}.csv'.format('Sister information data', 'Sister data'), 'a+', newline='', encoding='utf-8') as file_csv: csv_writer = csv.writer(file_csv, delimiter=',') csv_writer.writerow([name, xueli, job, marital_status, is_child, home, workplace, zeo_age, zeo_address, requ, monologue, img_url]) print(r'***Sister 
information data saved successfully:{}'.format(name)) if __name__ == '__main__': main()