openpyxl module knowledge supplement - reading data
openpyxl is not very convenient for reading Excel files; the pandas module offers a simpler way to read them
1. View the names of all worksheets in an Excel file
workbook.sheetnames, where workbook is the object returned by load_workbook(path)
from openpyxl import Workbook, load_workbook

# Build a workbook in memory; create_sheet adds worksheets after the default one.
wb = Workbook()
ws1 = wb.create_sheet("Student information form")
wb.create_sheet("Student transcript")

# First record is the header row, the rest are one row per student.
student_rows = (
    ["full name", "Age", "Gender"],
    ["Zhang San", 18, "male"],
    ["Li Si", 19, "female"],
    ["Lao Wang", 20, "male"],
    ["Xiaohei", 23, "male"],
)
for record in student_rows:
    ws1.append(record)

wb.save("Student information form.xlsx")

# Re-open the saved file and list the names of its worksheets.
wd = load_workbook(r'Student information form.xlsx')
print(wd.sheetnames)  # ['Sheet', 'Student information form', 'Student transcript']
2. View the maximum row and column of a specified worksheet in the Excel file
worksheet.max_row , worksheet.max_column
# Open the saved workbook and pick one worksheet by its name.
workbook = load_workbook(r'Student information form.xlsx')
sheet = workbook['Student information form']

print(sheet)  # <Worksheet "Student information form">
print(sheet.max_row)  # 5
print(sheet.max_column)  # 3
3. Read specific values - two ways
First way: index the worksheet by cell coordinate, worksheet['A1'].value. Second way: worksheet.cell(row, column).value
workbook = load_workbook(r'Student information form.xlsx')
sheet = workbook['Student information form']

# Way 1: address the cell by its coordinate string.
print(sheet['A1'].value)  # full name

# Way 2: address the cell by row and column numbers.
print(sheet.cell(row=3, column=2).value)  # 19
4. View all rows and columns of a specified worksheet in the Excel file
Iterate worksheet.rows (or worksheet.columns) with a for loop and read each cell's value
workbook = load_workbook(r'Student information form.xlsx')
sheet = workbook['Student information form']

# Printing .rows shows a generator object rather than the cells themselves.
print(sheet.rows)  # <generator object Worksheet._cells_by_row at 0x00000255FB87B4C0>

# Row by row: each item is a tuple of cells belonging to one row.
for row in sheet.rows:
    print([cell.value for cell in row])  # ['full name', 'Age', 'Gender'], ['Zhang San', 18, 'male'], ...

# Column by column: each item is a tuple of cells belonging to one column.
for column in sheet.columns:
    print(column)  # (<Cell 'Student information form'.A1>, ..., <Cell 'Student information form'.A5>)
    print([cell.value for cell in column])  # ['full name', 'Zhang San', 'Li Si', 'Lao Wang', 'Xiaohei'], ...
pandas module
The pandas module is powerful, and creating Excel files with it is more concise and convenient
import pandas

# One tuple per student, in the same order as the column headers below.
headers = ['full name', 'Age', 'Gender']
students = [
    ('Zhang San', 18, 'male'),
    ('Li Si', 19, 'female'),
    ('Lao Wang', 20, 'male'),
    ('Xiaohei', 23, 'male'),
]

df = pandas.DataFrame(students, columns=headers)
# Write the table (including the default integer index column) to an Excel file.
df.to_excel(r"Student information.xlsx")
Web crawler in practice -- crawling Lianjia second-hand housing data
https://sh.lianjia.com/ershoufang/
import re

import pandas
import requests

# Download the listing page once and cache the raw HTML on disk.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status stops an error page from being parsed as listings.
res1 = requests.get("https://sh.lianjia.com/ershoufang/", timeout=10)
res1.raise_for_status()
with open(r'lianjia.html', 'wb') as f:
    f.write(res1.content)
with open(r'lianjia.html', 'r', encoding='utf8') as f:
    data = f.read()

# Each pattern captures one field of a listing; results are in page order.
home_title = re.findall(
    '<a class="" href=".*?" target="_blank" data-log_index=".*?" data-el=".*?" data-housecode=".*?" data-is_focus="" data-sl="">(.*?)</a>',
    data,
)
home_name = re.findall('<a href=".*?" target="_blank" data-log_index=".*?" data-el=".*?">(.*?) </a>', data)
home_address = re.findall(' - <a href=".*?" target="_blank">(.*?)</a>', data)
home_detail_info = re.findall('<span class="houseIcon"></span>(.*?)</div>', data)
home_follow_info = re.findall('<div class="followInfo"><span class="starIcon"></span>(.*?)</div>', data)
home_all_price = re.findall('<div class=".*?"><i> </i><span class="">(.*?)</span>', data)
home_unit_price = re.findall(
    '<div class=".*?" data-hid=".*?" data-rid=".*?" data-price=".*?"><span>(.*?)</span>',
    data,
)

d = {
    'title': home_title,
    'Community name': home_name,
    'Community address': home_address,
    'House details': home_detail_info,
    'House collection information': home_follow_info,
    'House unit price': home_unit_price,
    'Total price of house(ten thousand)': home_all_price,
}

# Wrapping each list in a Series lets columns of different lengths coexist
# (short columns are padded with NaN); a plain DataFrame(d) raises ValueError
# as soon as one pattern matches a different number of times than the others.
res = pandas.DataFrame({column: pandas.Series(values) for column, values in d.items()})
res.to_excel(r'Chain house information.xlsx')
Random random number module
1.random()
Returns a random float in the range [0.0, 1.0)
import random

# Draw two independent floats; every run prints different values.
for _ in range(2):
    print(random.random())  # e.g. 0.49781208840423374, then 0.2810934030122708
2.randint(a,b)
Returns a random integer N such that a <= N <= b (both endpoints included)
import random

low, high = 2, 7
# Two independent draws from the closed range [low, high].
for _ in range(2):
    print(random.randint(low, high))  # e.g. 5, then 6
3.choice()
Randomly extract the data value in the container type
import random

prizes = ['the first prize', 'second award', 'third award']
# Pick one element at random, twice; the picks are independent of each other.
for _ in range(2):
    print(random.choice(prizes))  # e.g. third award, then second award
4.sample()
Random sampling, number of user-defined samples
import random

letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
sample_size = 3
# Each call returns a new list of sample_size distinct elements.
for _ in range(2):
    print(random.sample(letters, sample_size))  # e.g. ['C', 'G', 'B'], then ['B', 'G', 'F']
5.shuffle()
Shuffles the elements of a mutable sequence (such as a list) in place
import random

# A deck made of number cards, face cards and the two jokers.
deck = [2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'Q', 'K', 'A', 'king', 'Xiao Wang']

# shuffle reorders the list itself and returns None, so print the list afterwards.
random.shuffle(deck)
print(deck)  # e.g. [3, 4, 'Q', 10, 5, 'A', 8, 2, 'king', 6, 'Xiao Wang', 9, 'K', 7, 'J']
random module practice
Write python code to generate a five-character random verification code (digits, lowercase letters, uppercase letters)
import random
import string

# The three character classes a verification code may draw from.
char_classes = (string.digits, string.ascii_uppercase, string.ascii_lowercase)

# For every position, pick a class first and then one character from it, so
# digits, uppercase and lowercase each keep a 1/3 chance per position instead
# of digits being outnumbered by the 52 letters.
# NOTE: random is fine for a demo; use the secrets module for real security tokens.
code = ''.join(random.choice(random.choice(char_classes)) for _ in range(5))
print(code)  # e.g. BY84f (another run: bea9P)
Write python code to generate a random verification code of a user-defined length (digits, lowercase letters, uppercase letters)
import random
import string


def num_code(n: int) -> str:
    """Return a random verification code that is *n* characters long.

    Every character is a digit, an uppercase ASCII letter or a lowercase
    ASCII letter. The class is chosen first (1/3 chance each) and then a
    character is drawn from it, so digits are not outnumbered by letters.
    A non-positive *n* yields an empty string.

    NOTE: random is fine for a demo; use the secrets module for real
    security tokens.
    """
    char_classes = (string.digits, string.ascii_uppercase, string.ascii_lowercase)
    return ''.join(random.choice(random.choice(char_classes)) for _ in range(n))


print(num_code(4))  # e.g. fi8a
print(num_code(5))  # e.g. jIw2I
print(num_code(6))  # e.g. 4X1LX0
hashlib encryption module
1. What is encryption: the process of transforming plaintext data into ciphertext data. 2. Why encrypt: to prevent privacy disclosure. 3. How to judge whether a data value has been encrypted: if it is a seemingly irregular string of digits and letters, it is generally the result of encryption. 4. Encryption algorithm: the strategy used to encrypt plaintext data. ps: Different algorithms differ in complexity, and the lengths of their results also differ; generally, the longer the result, the more complex the algorithm. 5. Common algorithms: md5, the sha series, hmac, base64 (strictly speaking, md5 and sha are one-way hash algorithms and base64 is a reversible encoding rather than encryption). 6. Generally, the encrypted result cannot be decrypted. ps: So-called decryption usually swaps the concept: guess in advance what other people's passwords might be, compute the corresponding ciphertext with various algorithms, build a mapping between the two, and then look the ciphertext up in that mapping, e.g. {'Ciphertext 1': 123, 'Ciphertext 2': 321, ...}
hashlib code practice
Basic exercises
import hashlib

# Feed the plaintext (which must be bytes) straight into an md5 object
# and take the hexadecimal digest of the result.
res = hashlib.md5(b'123').hexdigest()
print(res)  # 202cb962ac59075b964b07152d234b70
Salt treatment
Before encrypting the plaintext, add some interfering strings (the salt) to it. There is also dynamic salting, where the interfering item is different every time, for example the current time or a slice of each user's user name
import hashlib

pwd = input("Please enter your bank card password>>>:").strip()

# The salt is hashed first and the password after it; hashing their
# concatenation gives the same digest as two successive update() calls.
salt_bytes = 'Bank of China (interference item)'.encode('utf8')
res = hashlib.md5(salt_bytes + pwd.encode('utf8')).hexdigest()
print(res)  # e.g. 21c0b48cc5b84847e4bcdb542d34cceb
supplement
As long as the plaintext data is the same, the ciphertext obtained by adopting the same algorithm must be the same
import hashlib

# Variant 1: feed the plaintext in three pieces.
piecewise = hashlib.md5()
for piece in (b'123', b'ABC', b'dfg'):
    piecewise.update(piece)
res = piecewise.hexdigest()
print(res)  # 42cef12be984c329ff47e5a887d17c4c

# Variant 2: feed the same plaintext in one go; the digest is the same.
whole = hashlib.md5()
whole.update(b'123ABCdfg')
res = whole.hexdigest()
print(res)  # 42cef12be984c329ff47e5a887d17c4c
Encryption practical application scenario
1. User password encryption: store the ciphertext at registration and compare ciphertexts at login. 2. File integrity verification: after a software release is finished, its content is encrypted; the website that provides the file publishes the ciphertext corresponding to the file content; after downloading, the user does not run the file directly but first encrypts the downloaded content and compares the two ciphertexts. If they match, the file has not been changed and can be run; if they differ, the file has been modified and may contain a virus. 3. Large-file encryption optimization: for example, how should a 100 GB program file be encrypted? Normally the whole 100 GB would be read and then encrypted, which runs very slowly, so instead of encrypting all of the content, only samples of it are encrypted, for example 30 bytes read every 500 MB
subprocess module
The subprocess module lets a program run system commands, simulating a cmd command window
import subprocess

cmd = input("Enter your instructions:>>>").strip()

# NOTE: shell=True hands the typed text to the system shell as-is. That is the
# point of this "simulated cmd window" demo, but never do it with untrusted input.
sub = subprocess.Popen(
    cmd,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# communicate() drains both pipes together. Reading stdout to the end before
# touching stderr can deadlock once the child fills the stderr pipe buffer.
out_bytes, err_bytes = sub.communicate()

# Both results are raw bytes; 'gbk' matches a Chinese-locale Windows console.
# errors='replace' keeps the demo from crashing on bytes that are not valid GBK.
# stdout: what the command printed when it ran successfully.
print(out_bytes.decode('gbk', errors='replace'))
# stderr: what the command printed when it reported an error.
print(err_bytes.decode('gbk', errors='replace'))
logging module
brief introduction
1. What is a log: logs are similar to historical records. 2. Why use logs: to record what has happened. 3. How to use logs:
3.1 Log levels (5)
import logging
logging.debug('debug Grade')  # 10
logging.info('info Grade')  # 20
logging.warning('warning Grade')  # 30 (by default, records are emitted from the warning level up)
logging.error('error Grade')  # 40
logging.critical('critical Grade')  # 50
>>> WARNING:root:warning Grade
>>> ERROR:root:error Grade
>>> CRITICAL:root:critical Grade
3.2 Basic use
import logging
file_handler = logging.FileHandler(filename='x1.log', mode='a', encoding='utf-8',)
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    handlers=[file_handler,],
    level=logging.ERROR
)
logging.error('Hello')
>>> 2022-07-21 18:04:55 PM - root - ERROR -logging modular: Hello
Components of the log module
The module has four components: the logger object (generates logs), the filter object (filters logs; can usually be ignored), the handler object (outputs logs), and the formatter object (defines the log format)
import logging

# 1. Log generation (the raw material): the logger object.
logger = logging.getLogger('Shopping cart record')

# 2. Log filtering (rejecting defective products): the filter object.
#    It can be ignored here, so none is created.

# 3. Log output (the finished product): handler objects.
hd1 = logging.FileHandler('a1.log', encoding='utf-8')  # output to the a1.log file
hd2 = logging.FileHandler('a2.log', encoding='utf-8')  # output to the a2.log file
hd3 = logging.StreamHandler()  # output to the terminal

# 4. Log format (the packaging): formatter objects.
fm1 = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S %p',
)
fm2 = logging.Formatter(
    fmt='%(asctime)s - %(name)s: %(message)s',
    datefmt='%Y-%m-%d',
)

# 5. Bind the handler objects to the logger object.
logger.addHandler(hd1)
logger.addHandler(hd2)
logger.addHandler(hd3)

# 6. Bind a formatter object to each handler.
hd1.setFormatter(fm1)
hd2.setFormatter(fm2)
hd3.setFormatter(fm1)

# 7. Set the log level (10 is DEBUG).
logger.setLevel(10)

# 8. Record a log. The message contains apostrophes, so it is delimited with
#    double quotes; single-quote delimiters would end the literal early.
logger.debug("I've been writing for a long time. I'm so tired and hot")

# ps: When recording logs there is no need to write all of the above by hand,
# which is too cumbersome. The module accepts a fixed configuration dictionary
# that can be loaded directly.
Log configuration dictionary
import logging
import logging.config

# Output formats. %(name)s is the name passed to logging.getLogger().
standard_format = (
    '[%(asctime)s][%(threadName)s:%(thread)d][task_id:%(name)s][%(filename)s:%(lineno)d]'
    '[%(levelname)s][%(message)s]'
)
simple_format = '[%(levelname)s][%(asctime)s][%(filename)s:%(lineno)d]%(message)s'

# Where the rotating file handler writes.
logfile_path = 'a3.log'

# Logging configuration dictionary consumed by logging.config.dictConfig.
LOGGING_DIC = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {'format': standard_format},
        'simple': {'format': simple_format},
    },
    'filters': {},  # no log filtering
    'handlers': {
        # Prints records to the terminal in the short format.
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
        },
        # Writes records to a size-rotated file in the detailed format.
        'default': {
            'level': 'DEBUG',
            'class': 'logging.handlers.RotatingFileHandler',
            'formatter': 'standard',
            'filename': logfile_path,
            'maxBytes': 5 * 1024 * 1024,  # rotate once the file reaches 5 MB
            'backupCount': 5,
            'encoding': 'utf-8',  # keeps non-ASCII (e.g. Chinese) messages readable
        },
    },
    'loggers': {
        # The empty-string key configures the root logger, so any name passed
        # to logging.getLogger() that has no entry of its own falls back to it.
        # Both handlers are attached: records go to the file and to the screen.
        # A dedicated entry (e.g. 'Shopping cart record' with level 'WARNING')
        # could be added next to this one in the same shape.
        '': {
            'handlers': ['default', 'console'],
            'level': 'DEBUG',
            'propagate': True,  # pass records up to higher-level loggers
        },
    },
}

# Load the configuration from the dictionary.
logging.config.dictConfig(LOGGING_DIC)

cart_logger = logging.getLogger('Shopping cart record')
cart_logger.warning('honorific VIP Good evening, customer. Here you are again')

register_logger = logging.getLogger('Registration record')
register_logger.debug('jason login was successful')
Practical application
1. Put the log dictionary in the configuration file (settings.py in the conf folder), because the dictionary is a fixed configuration of the logging module and hardly needs to change once written. ps: It is recommended to capitalize variable names in the configuration file. 2. Encapsulate the final logger-creation step into a function and put it in common.py in the public lib folder:
def get_logger(msg):
    # Log
    logging.config.dictConfig(settings.LOGGING_DIC)  # Automatically load the configuration in the dictionary
    logger1 = logging.getLogger(msg)
    # logger1.debug(f'{username}login was successful')  # Let users write better here
    return logger1
ps: The logger is returned so that the caller can choose the log level and message more flexibly later
task
Think about how to crawl multiple pages of Lianjia second-hand housing data
# first page: https://sh.lianjia.com/ershoufang/pg1/
# Page 2:     https://sh.lianjia.com/ershoufang/pg2/
# Page 3:     https://sh.lianjia.com/ershoufang/pg3/
# Page 100:   https://sh.lianjia.com/ershoufang/pg100/
import re

import pandas
import requests

base_path = "https://sh.lianjia.com/ershoufang/pg%s/"

# Open the cache in 'wb' so every run starts from an empty file; append mode
# would pile the new pages on top of whatever a previous run left behind.
with open(r'lianjias.html', 'wb') as f:
    for i in range(1, 101):
        # Fetch page i and store that response. Requesting the un-paged listing
        # URL here would save the first page one hundred times.
        page = requests.get(base_path % i, timeout=10)
        page.raise_for_status()
        f.write(page.content)

with open(r'lianjias.html', 'r', encoding='utf8') as f:
    data = f.read()

# Each pattern captures one field of a listing across all cached pages.
home_title = re.findall(
    '<a class="" href=".*?" target="_blank" data-log_index=".*?" data-el=".*?" data-housecode=".*?" data-is_focus="" data-sl="">(.*?)</a>',
    data,
)
home_name = re.findall('<a href=".*?" target="_blank" data-log_index=".*?" data-el=".*?">(.*?) </a>', data)
home_address = re.findall(' - <a href=".*?" target="_blank">(.*?)</a>', data)
# e.g. '2 rooms and 2 halls | 96.46 square meters | north and South | hardbound | ...'
home_detail_info = re.findall('<span class="houseIcon"></span>(.*?)</div>', data)
home_follow_info = re.findall('<div class="followInfo"><span class="starIcon"></span>(.*?)</div>', data)
home_all_price = re.findall('<div class=".*?"><i> </i><span class="">(.*?)</span>', data)
home_unit_price = re.findall(
    '<div class=".*?" data-hid=".*?" data-rid=".*?" data-price=".*?"><span>(.*?)</span>',
    data,
)

d = {
    'title': home_title,
    'Community name': home_name,
    'Community address': home_address,
    'House details': home_detail_info,
    'House collection information': home_follow_info,
    'House unit price': home_unit_price,
    'Total price of house(ten thousand)': home_all_price,
}

# Wrapping each list in a Series pads shorter columns with NaN; a plain
# DataFrame(d) raises ValueError when the patterns match unequal counts.
res = pandas.DataFrame({column: pandas.Series(values) for column, values in d.items()})
res.to_excel(r'Chain house information(s).xlsx')