openpyxl knowledge supplement, pandas module basics, random module, hashlib module, subprocess module, logging module, exercises

openpyxl module knowledge supplement - reading data

openpyxl is not very convenient for reading Excel files; the pandas module (covered below) offers a smoother way to read them

1. View the names of all worksheets in an Excel file

workbook.sheetnames (the workbook object returned by load_workbook)

from openpyxl import Workbook,load_workbook
wb = Workbook()
ws1 = wb.create_sheet("Student information form")
ws2 = wb.create_sheet("Student transcript")
ws1.append(["full name","Age","Gender"])
ws1.append(["Zhang San",18,"male"])
ws1.append(["Li Si",19,"female"])
ws1.append(["Lao Wang",20,"male"])
ws1.append(["Xiaohei",23,"male"])
wb.save("Student information form.xlsx")

wd = load_workbook(r'Student information form.xlsx')
print(wd.sheetnames)  # ['Sheet', 'Student information form', 'Student transcript']

2. View the maximum row and column of a specified worksheet in an Excel file

worksheet.max_row, worksheet.max_column

wd = load_workbook(r'Student information form.xlsx')
wd1 = wd['Student information form']
print(wd1)  # <Worksheet "Student information form">
print(wd1.max_row)  # 5
print(wd1.max_column)  # 3

3. Read specific values - two ways

The first way: index the worksheet by cell coordinate and take .value, e.g. worksheet['A1'].value; the second way: worksheet.cell(row, column).value

wd = load_workbook(r'Student information form.xlsx')
wd1 = wd['Student information form']
print(wd1['A1'].value)  # full name
print(wd1.cell(row = 3,column = 2).value)  # 19

4. View all rows and columns of a specified worksheet in an Excel file

Iterate over worksheet.rows (or worksheet.columns) with a for loop and read each cell's value

wd = load_workbook(r'Student information form.xlsx')
wd1 = wd['Student information form']
print(wd1.rows)  # <generator object Worksheet._cells_by_row at 0x00000255FB87B4C0>
for i in wd1.rows:
    # print(i)  # (<Cell 'Student information form'.A1>, <Cell 'Student information form'.B1>, <Cell 'Student information form'.C1>)
    print([j.value for j in i])  # ['full name', 'Age', 'Gender'], ['Zhang San', 18, 'male'], ...
for j in wd1.columns:
    print(j)  # (<Cell 'Student information form'.A1>, <Cell 'Student information form'.A2>, <Cell 'Student information form'.A3>, <Cell 'Student information form'.A4>, <Cell 'Student information form'.A5>)
    print([i.value for i in j])  # ['full name', 'Zhang San', 'Li Si', 'Lao Wang', 'Xiaohei'], ['Age', 18, 19, 20, 23], ...

pandas module

The pandas module is powerful; creating Excel files with it is more concise and convenient

import pandas
d = {
    'full name':['Zhang San','Li Si','Lao Wang','Xiaohei'],
    "Age":[18,19,20,23],
    "Gender":['male','female','male','male']
}
df = pandas.DataFrame(d)
df.to_excel(r"Student information.xlsx")

Web crawler practice: crawling Lianjia second-hand housing data

https://sh.lianjia.com/ershoufang/

import requests
import re
import pandas
# Download the listing page once and cache it locally, then parse from the cache
res1 = requests.get("https://sh.lianjia.com/ershoufang/")
with open(r'lianjia.html', 'wb') as f:
    f.write(res1.content)
with open(r'lianjia.html', 'r', encoding='utf8') as f:
    data = f.read()
# Extract each field from the cached HTML with regular expressions
home_title = re.findall('<a class="" href=".*?" target="_blank" data-log_index=".*?"  data-el=".*?" data-housecode=".*?" data-is_focus="" data-sl="">(.*?)</a>',data)
home_name = re.findall('<a href=".*?" target="_blank" data-log_index=".*?" data-el=".*?">(.*?) </a>',data)
home_address = re.findall('   -  <a href=".*?" target="_blank">(.*?)</a>',data)
home_detail_info = re.findall('<span class="houseIcon"></span>(.*?)</div>',data)
home_follow_info =re.findall('<div class="followInfo"><span class="starIcon"></span>(.*?)</div>',data)
home_all_price =re.findall('<div class=".*?"><i> </i><span class="">(.*?)</span>',data)
home_unit_price = re.findall('<div class=".*?" data-hid=".*?" data-rid=".*?" data-price=".*?"><span>(.*?)</span>',data)

d = {
    'title':home_title,
    'Community name':home_name,
    'Community address':home_address,
    'House details':home_detail_info,
    'House collection information':home_follow_info,
    'House unit price':home_unit_price,
    'Total price of house(ten thousand)':home_all_price
}
res = pandas.DataFrame(d)
res.to_excel(r'Chain house information.xlsx')

random module (random numbers)

1.random()

Returns a random float in the range [0, 1)

import random
print(random.random())  # 0.49781208840423374
print(random.random())   # 0.2810934030122708

2.randint(a,b)

Returns a random integer N such that a <= N <= b (both endpoints included)

import random
print(random.randint(2, 7)) # 5
print(random.randint(2, 7))  # 6

3.choice()

Randomly picks one element from a sequence (list, tuple, string, ...)

import random
print(random.choice(['first prize', 'second prize', 'third prize']))  # third prize
print(random.choice(['first prize', 'second prize', 'third prize']))  # second prize

4.sample()

Random sampling without replacement; the number of samples is specified by the caller

import random
print(random.sample(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 3))  # ['C', 'G', 'B']
print(random.sample(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 3))  # ['B', 'G', 'F']

5.shuffle()

Shuffles the elements of a mutable sequence in place

import random
l1 = [2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'Q', 'K', 'A', 'big joker', 'little joker']
random.shuffle(l1)
print(l1)  # [3, 4, 'Q', 10, 5, 'A', 8, 2, 'big joker', 6, 'little joker', 9, 'K', 7, 'J']

random module practice

Write Python code to generate a five-character random verification code (digits, lowercase letters, uppercase letters)

import random
code = ''
for i in range(5):
    res1 = str(random.randint(0, 9))     # a random digit
    res2 = chr(random.randint(65, 90))   # a random uppercase letter (ASCII 65-90)
    res3 = chr(random.randint(97, 122))  # a random lowercase letter (ASCII 97-122)
    num = random.choice([res1, res2, res3])
    code += num
print(code)  # BY84f (a second run gave bea9P)

Write Python code to generate a random verification code of user-defined length (digits, lowercase letters, uppercase letters)

import random
def num_code(n):
    code = ''
    for i in range(n):
        res1 = str(random.randint(0,9))
        res2 = chr(random.randint(65,90))
        res3 = chr(random.randint(97,122))
        num = random.choice([res1,res2,res3])
        code += num
    return code

print(num_code(4))  # fi8a
print(num_code(5))  # jIw2I
print(num_code(6))  # 4X1LX0
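
For comparison only (not part of the original exercise), the string module's ready-made character sets make the same idea shorter:

import random
import string

def num_code2(n):
    # digits + lowercase + uppercase, then pick n characters at random
    pool = string.digits + string.ascii_lowercase + string.ascii_uppercase
    return ''.join(random.choice(pool) for _ in range(n))

print(num_code2(5))  # e.g. k3Qx7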

hashlib encryption module

1.What is encryption
	The process of transforming plaintext data into ciphertext through some processing
2.Why encrypt
	To prevent privacy from leaking
3.How to judge whether a data value has already been encrypted
	Generally, an irregular-looking mix of digits and letters is the result of encryption
4.Encryption algorithm: the strategy used to transform the plaintext
   ps: Different algorithms differ in complexity, and their outputs differ as well
       Generally, the longer the encrypted result, the more complex the algorithm
5.Common algorithms: md5, the sha series, hmac, base64
6.Generally, the encrypted result cannot be decrypted
	ps: So-called "decryption" is usually a sleight of hand: guess likely passwords in advance, compute their ciphertexts with the same algorithm, build a lookup table such as {'ciphertext 1': 123, 'ciphertext 2': 321, ...}, and then match intercepted ciphertexts against that table (see the sketch below)
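
A minimal sketch of the lookup-table trick described in point 6 (the candidate passwords are made up for illustration):

import hashlib

# precompute ciphertexts for a handful of guessed passwords
candidates = ['123', '123456', 'password']
table = {hashlib.md5(p.encode('utf8')).hexdigest(): p for p in candidates}

# "decryption" is then just a dictionary lookup on an intercepted ciphertext
intercepted = '202cb962ac59075b964b07152d234b70'  # md5 of '123', see the exercise below
print(table.get(intercepted, 'not in table'))  # 123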

hashlib code practice

Basic exercises

import hashlib
m1 = hashlib.md5()  # choose the md5 algorithm as the hashing strategy
m1.update(b'123')  # feed in the plaintext data; it must be bytes
res = m1.hexdigest()  # get the encrypted result (hex digest)
print(res)  # 202cb962ac59075b964b07152d234b70

Salting

Before hashing the plaintext, mix in some interfering strings (a salt). There is also dynamic salting, where the interference differs every time, for example the current time, or a slice of each user's username (see the sketch after the example below)

import hashlib
pwd = input("Please enter your bank card password>>>:").strip()
m1 = hashlib.md5()
m1.update('Bank of China (interference item)'.encode('utf8'))
m1.update(pwd.encode('utf8'))
res = m1.hexdigest()
print(res)  # 21c0b48cc5b84847e4bcdb542d34cceb
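
The dynamic salting mentioned above is not shown in the lesson code; a rough sketch, where the time format and the username slice are arbitrary choices for illustration:

import hashlib
import time

def dynamic_md5(username, pwd):
    m = hashlib.md5()
    m.update(time.strftime('%Y%m%d%H%M%S').encode('utf8'))  # current time as one interference item
    m.update(username[:3].encode('utf8'))                    # a slice of the username as another
    m.update(pwd.encode('utf8'))
    return m.hexdigest()

print(dynamic_md5('jason123', 'my_password'))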

supplement

As long as the total plaintext is the same, the same algorithm produces the same ciphertext, whether the data is fed in all at once or split across several update() calls

import hashlib
m1 = hashlib.md5()
m1.update(b'123')
m1.update(b'ABC')
m1.update(b'dfg')
res = m1.hexdigest()
print(res)  # 42cef12be984c329ff47e5a887d17c4c

import hashlib
m1 = hashlib.md5()
m1.update(b'123ABCdfg')
res = m1.hexdigest()
print(res)  # 42cef12be984c329ff47e5a887d17c4c

Practical application scenarios for encryption

1.User password storage: hash the password at registration, compare ciphertexts at login
2.File integrity verification
	After a piece of software is finalized, its content is hashed and the website publishes the ciphertext alongside the download file; after downloading, the user does not run it directly but hashes the downloaded content and compares the two ciphertexts. If they match, the file has not been changed and can be run; if not, the file was modified and may have been implanted with a virus
3.Optimizing the hashing of large files
	For example: a 100 GB program file that needs to be hashed
    Reading and hashing all 100 GB would be very slow, so instead of hashing the entire content, only samples are hashed, for example reading 30 bytes every 500 MB (see the sketch below)
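
A minimal sketch of the sampling idea in point 3 (the 500 MB step and 30-byte read come from the text above; the file name is hypothetical):

import os
import hashlib

def sample_md5(path, step=500 * 1024 * 1024, chunk=30):
    # hash only `chunk` bytes every `step` bytes instead of the whole file
    m = hashlib.md5()
    size = os.path.getsize(path)
    with open(path, 'rb') as f:
        for offset in range(0, size, step):
            f.seek(offset)
            m.update(f.read(chunk))
    return m.hexdigest()

# print(sample_md5('big_program.bin'))  # hypothetical 100 GB file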

subprocess module

subprocess lets a Python program run system commands, as if typing them into a cmd window

import subprocess
cmd = input("Enter your instructions:>>>").strip()
sub = subprocess.Popen(
    cmd,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)
# print(sub.stdout.read())  # binary data
# stdout holds the normal output of the command
print(sub.stdout.read().decode('gbk'))  # 'gbk' matches the Windows cmd console; use 'utf8' on Linux/macOS
# stderr holds the error output when the command fails
print(sub.stderr.read().decode('gbk'))
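
For reference (not what the lesson uses), subprocess.run wraps the same Popen/PIPE pattern in a single call:

import subprocess

cmd = input("Enter your instruction:>>>").strip()
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(result.stdout.decode('gbk'))  # normal output ('utf8' on Linux/macOS)
print(result.stderr.decode('gbk'))  # error output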

logging module

brief introduction

1.What is a log
	A log is like a historical record of what a program has done
2.Why use logs
	To keep a record of what happened, so problems can be traced afterwards
3.How to use logs
	3.1 Log levels (there are five)
    import logging
    logging.debug('debug level')  # 10
    logging.info('info level')  # 20
    logging.warning('warning level')  # 30, the default threshold: only this level and above are emitted
    logging.error('error level')  # 40
    logging.critical('critical level')  # 50
    >>> WARNING:root:warning level
    >>> ERROR:root:error level
    >>> CRITICAL:root:critical level
	3.2 Basic use
    import logging
    file_handler = logging.FileHandler(filename='x1.log', mode='a', encoding='utf-8',)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S %p',
        handlers=[file_handler,],
        level=logging.ERROR
    )

    logging.error('Hello')
    >>> 2022-07-21 18:04:55 PM - root - ERROR -<module name>:  Hello
    

Components of the log module

There are four: the logger object (produces log records), the filter object (filters records; can usually be ignored), the handler object (controls where records go), and the formatter object (controls how records look)

import logging

# 1. Produce the log record (prepare the raw material) >>> logger object
logger = logging.getLogger('Shopping cart record')
# 2. Filter the log (reject defective products) >>> filter object (can usually be ignored)
# 3. Output the log (the finished product) >>> handler object
hd1 = logging.FileHandler('a1.log', encoding='utf-8')  # output to the a1.log file
hd2 = logging.FileHandler('a2.log', encoding='utf-8')  # output to the a2.log file
hd3 = logging.StreamHandler()  # output to the terminal
# 4. Format the log (the packaging) >>> formatter object
fm1 = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S %p',
)
fm2 = logging.Formatter(
        fmt='%(asctime)s - %(name)s:  %(message)s',
        datefmt='%Y-%m-%d',
)
# 5. Bind the handler object to the logger object
logger.addHandler(hd1)
logger.addHandler(hd2)
logger.addHandler(hd3)
# 6. Bind a formatter object to each handler
hd1.setFormatter(fm1)
hd2.setFormatter(fm2)
hd3.setFormatter(fm1)
# 7. Set log level
logger.setLevel(10)  # 10 = DEBUG, so every level is recorded
# 8. Record log
logger.debug("I've been writing for a long time; I'm tired and hot")

ps: In practice we do not wire all of this up by hand every time, which would be too cumbersome; the module can load a fixed configuration dictionary directly (see below)

Log configuration dictionary

import logging
import logging.config
# Define the log output formats
standard_format = '[%(asctime)s][%(threadName)s:%(thread)d][task_id:%(name)s][%(filename)s:%(lineno)d]' \
                  '[%(levelname)s][%(message)s]'  # %(name)s is the name passed to getLogger
simple_format = '[%(levelname)s][%(asctime)s][%(filename)s:%(lineno)d]%(message)s'

# Custom file path
logfile_path = 'a3.log'
# log configuration dictionary
LOGGING_DIC = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': standard_format
        },
        'simple': {
            'format': simple_format
        },
    },
    'filters': {},  # Filter logs
    'handlers': {
        # handler that prints logs to the terminal
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',  # Print to screen
            'formatter': 'simple'
        },
        # handler that writes logs to a rotating file; collects DEBUG and above (per the level below)
        'default': {
            'level': 'DEBUG',
            'class': 'logging.handlers.RotatingFileHandler',  # Save to file
            'formatter': 'standard',
            'filename': logfile_path,  # log file
            'maxBytes': 1024*1024*5,  # rotate when the log file reaches 5 MB
            'backupCount': 5,  # keep at most 5 rotated files
            'encoding': 'utf-8',  # encoding of the log file, so Chinese text is not garbled
        },
    },
    'loggers': {
        # the configuration used by logging.getLogger(__name__)
        '': {
            'handlers': ['default', 'console'],  # use both handlers defined above: write to the file and print to the screen
            'level': 'DEBUG',
            'propagate': True,  # propagate to higher-level (parent) loggers
        },  # the empty-string key is the fallback: any logger name not listed explicitly uses this configuration
        # 'Shopping cart record': {
        #     'handlers': ['default', 'console'],  # write to the file and print to the screen
        #     'level': 'WARNING',
        #     'propagate': True,  # propagate to higher-level (parent) loggers
        # },  # a named logger can also be given its own configuration like this
    },
}

logging.config.dictConfig(LOGGING_DIC)  # Automatically load the configuration in the dictionary
logger1 = logging.getLogger('Shopping cart record')
logger1.warning('Good evening, dear VIP customer, here you are again')
logger1 = logging.getLogger('Registration record')
logger1.debug('jason logged in successfully')

Practical application

1.Put the log configuration dictionary in a settings file (e.g. settings.py inside a conf folder). Since the dictionary is essentially fixed configuration for the logging module, it rarely needs to change once written
	ps: variable names in a configuration file are conventionally written in uppercase
2.Wrap the final "get a logger" step into a function and put it in a shared module (e.g. common.py inside a lib folder):
import logging.config
from conf import settings  # the settings module from step 1, holding LOGGING_DIC

def get_logger(msg):
    # Load the dictionary configuration, then return a named logger
    logging.config.dictConfig(settings.LOGGING_DIC)
    logger1 = logging.getLogger(msg)
    # the actual logging call, e.g. logger1.debug(f'{username} logged in successfully'), is left to the caller
    return logger1
ps: Returning the logger (rather than logging inside the function) lets the caller choose the level and message flexibly
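
Usage then looks roughly like this (assuming the conf/settings.py and lib/common.py layout described above):

from lib import common

logger = common.get_logger('Shopping cart record')
logger.info('jason added an item to the cart')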

task

Think about how to crawl multiple pages of Lianjia second-hand housing data

# first page: https://sh.lianjia.com/ershoufang/pg1/
# Page 2: https://sh.lianjia.com/ershoufang/pg2/
# Page 3: https://sh.lianjia.com/ershoufang/pg3/
# Page 100: https://sh.lianjia.com/ershoufang/pg100/
import requests
import re
import pandas
base_path = "https://sh.lianjia.com/ershoufang/pg%s/"
for i in range(1, 101):
    res1 = requests.get(base_path % i)  # fetch page i
    with open(r'lianjias.html', 'ab') as f:  # append each page to one cache file
        f.write(res1.content)
with open(r'lianjias.html', 'r', encoding='utf8') as f:
    data = f.read()
home_title = re.findall(
    '<a class="" href=".*?" target="_blank" data-log_index=".*?"  data-el=".*?" data-housecode=".*?" data-is_focus="" data-sl="">(.*?)</a>',
    data)
# print(home_title)
home_name = re.findall('<a href=".*?" target="_blank" data-log_index=".*?" data-el=".*?">(.*?) </a>', data)
# print(home_name)
home_address = re.findall('   -  <a href=".*?" target="_blank">(.*?)</a>', data)
# print(home_address)
home_detail_info = re.findall('<span class="houseIcon"></span>(.*?)</div>', data)
# print(home_detail_info) # ['2 rooms and 2 halls | 96.46 square meters | north and South | hardbound | middle floor (a total of 6 floors) | board building built in 2007 ']
home_follow_info = re.findall('<div class="followInfo"><span class="starIcon"></span>(.*?)</div>', data)
# print(home_follow_info)
home_all_price = re.findall('<div class=".*?"><i> </i><span class="">(.*?)</span>', data)
# print(home_all_price)
home_unit_price = re.findall('<div class=".*?" data-hid=".*?" data-rid=".*?" data-price=".*?"><span>(.*?)</span>',
                             data)
# print(home_unit_price)

d = {
    'title': home_title,
    'Community name': home_name,
    'Community address': home_address,
    'House details': home_detail_info,
    'House collection information': home_follow_info,
    'House unit price': home_unit_price,
    'Total price of house(ten thousand)': home_all_price
}
res = pandas.DataFrame(d)
res.to_excel(r'Chain house information(s).xlsx')
