# A collection of 50 Python plots for research papers, from beginner to proficient, with code

Today I am sharing a good Python plotting resource:

https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

image-20230324232925848

This website provides the 50 most common Python plotting (Matplotlib/Seaborn) methods, with code, for scientific research papers.

The plots are grouped according to 7 different visualization goals. For example, if you want to plot the relationship between two variables, look at the plots under the Correlation section. Or, if you want to show how a value changes over time, check out the Change section, and so on.

This article shows some excellent examples; the complete code can be found on the website, and the data can be found below:

# Correlation

Correlation plots are used to visualize the relationship between 2 or more variables, that is, how one variable changes relative to another.

## Linear regression fit

The line of best fit is a great way to see how two variables vary with each other. The figure below shows how the line of best fit differs between groups in the data. To disable grouping and draw only one line of best fit for the entire dataset, remove the `hue='cyl'` parameter from the `sns.lmplot()` call.

`import seaborn as sns# Import Datadf = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")df_select = df.loc[df.cyl.isin([4,8]), :]# Plotsns.set_style("white")gridobj = sns.lmplot(x="displ", y="hwy", hue="cyl", data=df_select,                      height=7, aspect=1.6, robust=True, palette='tab10',                      scatter_kws=dict(s=60, linewidths=.7, edgecolors='black'))# Decorationsgridobj.set(xlim=(0.5, 7.5), ylim=(0, 50))plt.title("Scatterplot with line of best fit grouped by number of cylinders", fontsize=20)plt.show()`

## marginal boxplot

Marginal boxplots serve a similar purpose to marginal histograms. Boxplots help determine the median, 25th, and 75th percentiles of X and Y.

`import pandas as pd# Import Datadf = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")# Create Fig and gridspecfig = plt.figure(figsize=(16, 10), dpi= 80)grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.2)# Define the axesax_main = fig.add_subplot(grid[:-1, :-1])ax_right = fig.add_subplot(grid[:-1, -1], xticklabels=[], yticklabels=[])ax_bottom = fig.add_subplot(grid[-1, 0:-1], xticklabels=[], yticklabels=[])# Scatterplot on main axax_main.scatter('displ', 'hwy', s=df.cty*5, c=df.manufacturer.astype('category').cat.codes, alpha=.9, data=df, cmap="Set1", edgecolors='black', linewidths=.5)# Add a graph in each partsns.boxplot(df.hwy, ax=ax_right, orient="v")sns.boxplot(df.displ, ax=ax_bottom, orient="h")# Decorations ------------------# Remove x axis name for the boxplotax_bottom.set(xlabel='')ax_right.set(ylabel='')# Main Title, Xlabel and YLabelax_main.set(title='Scatterplot with Histograms \n displ vs hwy', xlabel='displ', ylabel='hwy')# Set font size of different componentsax_main.title.set_fontsize(20)for item in ([ax_main.xaxis.label, ax_main.yaxis.label] + ax_main.get_xticklabels() + ax_main.get_yticklabels()):    item.set_fontsize(14)plt.show()`

## correlation diagram

Correlograms are used to visually view correlation measures between all possible pairs of numeric variables in a given two-dimensional array.

`# Import Datasetdf = pd.read_csv("./python/matplotlib-data/mtcars.csv")# Plotplt.figure(figsize=(12,10), dpi= 80)sns.heatmap(df.corr(), xticklabels=df.corr().columns, yticklabels=df.corr().columns, cmap='RdYlGn', center=0, annot=True)# Decorationsplt.title('Correlogram of mtcars', fontsize=22)plt.xticks(fontsize=12)plt.yticks(fontsize=12)plt.show()`

# Deviation

## Lollipop diagram with markers

`# Prepare Datadf = pd.read_csv("./python/matplotlib-data/mtcars.csv")x = df.loc[:, ['mpg']]df['mpg_z'] = (x - x.mean())/x.std()df['colors'] = 'black'# color fiat differentlydf.loc[df.cars == 'Fiat X1-9', 'colors'] = 'darkorange'df.sort_values('mpg_z', inplace=True)df.reset_index(inplace=True)# Draw plotimport matplotlib.patches as patchesplt.figure(figsize=(14,16), dpi= 80)plt.hlines(y=df.index, xmin=0, xmax=df.mpg_z, color=df.colors, alpha=0.4, linewidth=1)plt.scatter(df.mpg_z, df.index, color=df.colors, s=[600 if x == 'Fiat X1-9' else 300 for x in df.cars], alpha=0.6)plt.yticks(df.index, df.cars)plt.xticks(fontsize=12)# Annotateplt.annotate('Mercedes Models', xy=(0.0, 11.0), xytext=(1.0, 11), xycoords='data',             fontsize=15, ha='center', va='center',            bbox=dict(boxstyle='square', fc='firebrick'),            arrowprops=dict(arrowstyle='-[, widthB=2.0, lengthB=1.5', lw=2.0, color='steelblue'), color='white')# Add Patchesp1 = patches.Rectangle((-2.0, -1), width=.3, height=3, alpha=.2, facecolor='red')p2 = patches.Rectangle((1.5, 27), width=.8, height=5, alpha=.2, facecolor='green')plt.gca().add_patch(p1)plt.gca().add_patch(p2)# Decorateplt.title('Diverging Bars of Car Mileage', fontdict={'size':20})plt.grid(linestyle='--', alpha=0.5)plt.show()`

## area chart

The area chart is a relatively new type of plot and is worth mastering.

By coloring the area between the axis and the line, the area chart puts more emphasis not only on the peaks and troughs, but also on the duration of the highs and lows. The longer the high lasts, the larger the area below the line.

`import numpy as npimport pandas as pdimport matplotlib.pyplot as plt# Prepare Datadf = pd.read_csv("./python/matplotlib-data/economics.csv", parse_dates=['date']).head(100)x = np.arange(df.shape[0])y_returns = (df.psavert.diff().fillna(0)/df.psavert.shift(1)).fillna(0) * 100# Plotplt.figure(figsize=(16,10), dpi= 80)plt.fill_between(x[1:], y_returns[1:], 0, where=y_returns[1:] >= 0, facecolor='green', interpolate=True, alpha=0.7)plt.fill_between(x[1:], y_returns[1:], 0, where=y_returns[1:] <= 0, facecolor='red', interpolate=True, alpha=0.7)# Annotateplt.annotate('Peak \n1975', xy=(94.0, 21.0), xytext=(88.0, 28),             bbox=dict(boxstyle='square', fc='firebrick'),             arrowprops=dict(facecolor='steelblue', shrink=0.05), fontsize=15, color='white')# Decorationsxtickvals = [str(m)[:3].upper()+"-"+str(y) for y,m in zip(df.date.dt.year, df.date.dt.month_name())]plt.gca().set_xticks(x[::6])plt.gca().set_xticklabels(xtickvals[::6], rotation=90, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'})plt.ylim(-35,35)plt.xlim(1,100)plt.title("Month Economics Return %", fontsize=22)plt.ylabel('Monthly returns %')plt.grid(alpha=0.5)plt.show()`

# Ranking

Ranking plots are meant for data with an obvious order; ordered bar charts, lollipop charts, and dumbbell charts are recommended.

## ordered bar chart

An ordered bar chart effectively communicates the ranking order of items. It is a classic method of visualization. The highlight here is that the value of each metric is written above its bar, so the reader can get precise information from the chart itself. In addition, the labels below the axis are shaded by color band, a technique worth learning.

`# Prepare Datadf_raw = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")df = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.mean())df.sort_values('cty', inplace=True)df.reset_index(inplace=True)# Draw plotimport matplotlib.patches as patchesfig, ax = plt.subplots(figsize=(16,10), facecolor='white', dpi= 80)ax.vlines(x=df.index, ymin=0, ymax=df.cty, color='firebrick', alpha=0.7, linewidth=20)# Annotate Textfor i, cty in enumerate(df.cty):    ax.text(i, cty+0.5, round(cty, 1), horizontalalignment='center')# Title, Label, Ticks and Ylimax.set_title('Bar Chart for Highway Mileage', fontdict={'size':22})ax.set(ylabel='Miles Per Gallon', ylim=(0, 30))plt.xticks(df.index, df.manufacturer.str.upper(), rotation=60, horizontalalignment='right', fontsize=12)# Add patches to color the X axis labelsp1 = patches.Rectangle((.57, -0.005), width=.33, height=.13, alpha=.1, facecolor='green', transform=fig.transFigure)p2 = patches.Rectangle((.124, -0.005), width=.446, height=.13, alpha=.1, facecolor='red', transform=fig.transFigure)fig.add_artist(p1)fig.add_artist(p2)plt.show()`

## dumbbell illustration

The dumbbell plot conveys the "before" and "after" positions of various items (here, 2013 vs. 2014) together with the ranking order of the items.

`import matplotlib.lines as mlines# Import Datadf = pd.read_csv("./python/matplotlib-data/health.csv")df.sort_values('pct_2014', inplace=True)df.reset_index(inplace=True)# Func to draw line segmentdef newline(p1, p2, color='black'):    ax = plt.gca()    l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color='skyblue')    ax.add_line(l)    return l# Figure and Axesfig, ax = plt.subplots(1,1,figsize=(14,14), facecolor='#f7f7f7', dpi= 80)# Vertical Linesax.vlines(x=.05, ymin=0, ymax=26, color='black', alpha=1, linewidth=1, linestyles='dotted')ax.vlines(x=.10, ymin=0, ymax=26, color='black', alpha=1, linewidth=1, linestyles='dotted')ax.vlines(x=.15, ymin=0, ymax=26, color='black', alpha=1, linewidth=1, linestyles='dotted')ax.vlines(x=.20, ymin=0, ymax=26, color='black', alpha=1, linewidth=1, linestyles='dotted')# Pointsax.scatter(y=df['index'], x=df['pct_2013'], s=50, color='#0e668b', alpha=0.7)ax.scatter(y=df['index'], x=df['pct_2014'], s=50, color='#a3c4dc', alpha=0.7)# Line Segmentsfor i, p1, p2 in zip(df['index'], df['pct_2013'], df['pct_2014']):    newline([p1, i], [p2, i])# Decorationax.set_facecolor('#f7f7f7')ax.set_title("Dumbell Chart: Pct Change - 2013 vs 2014", fontdict={'size':22})ax.set(xlim=(0,.25), ylim=(-1, 27), ylabel='Mean GDP Per Capita')ax.set_xticks([.05, .1, .15, .20])ax.set_xticklabels(['5%', '15%', '20%', '25%'])ax.set_xticklabels(['5%', '15%', '20%', '25%'])    plt.show()`

# Distribution

This section introduces multivariate histograms, density plots, and distribution dot plots (a newer plotting method).

The traditional box plot and violin plot are skipped.

## multivariate histogram

A histogram for a categorical variable shows the frequency distribution of that variable. By coloring the bars, you can also visualize the distribution in relation to a second categorical variable, which is represented by the color.

`# Import Datadf = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")# Prepare datax_var = 'manufacturer'groupby_var = 'class'df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)vals = [df[x_var].values.tolist() for i, df in df_agg]# Drawplt.figure(figsize=(16,9), dpi= 80)colors = [plt.cm.Spectral(i/float(len(vals)-1)) for i in range(len(vals))]n, bins, patches = plt.hist(vals, df[x_var].unique().__len__(), stacked=True, density=False, color=colors[:len(vals)])# Decorationplt.legend({group:col for group, col in zip(np.unique(df[groupby_var]).tolist(), colors[:len(vals)])})plt.title(f"Stacked Histogram of \${x_var}\$ colored by \${groupby_var}\$", fontsize=22)plt.xlabel(x_var)plt.ylabel("Frequency")plt.ylim(0, 40)plt.xticks(ticks=bins, labels=np.unique(df[x_var]).tolist(), rotation=90, horizontalalignment='left')plt.show()`

## Density plot

Density plots are a commonly used tool for visualizing the distribution of continuous variables. By grouping them by class variables, you can explore relationships and differences between variables.

A density curve with a histogram brings together the collective information conveyed by both plots, allowing them to be both placed in one plot.

`# Import Dataimport seaborn as snsdf = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")# Draw Plotplt.figure(figsize=(13,10), dpi= 80)sns.distplot(df.loc[df['class'] == 'compact', "cty"], color="dodgerblue", label="Compact", hist_kws={'alpha':.7}, kde_kws={'linewidth':3})sns.distplot(df.loc[df['class'] == 'suv', "cty"], color="orange", label="SUV", hist_kws={'alpha':.7}, kde_kws={'linewidth':3})sns.distplot(df.loc[df['class'] == 'minivan', "cty"], color="g", label="minivan", hist_kws={'alpha':.7}, kde_kws={'linewidth':3})plt.ylim(0, 0.35)# Decorationplt.title('Density Plot of City Mileage by Vehicle Type', fontsize=22)plt.legend()plt.show()`

## Distribution dot plot

Distribution point plots display the univariate distribution of points split by group. The darker the point, the more concentrated the data points in that area. By coloring the medians differently, the true positioning of the groups becomes immediately apparent.

`import matplotlib.patches as mpatches# Prepare Datadf_raw = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")cyl_colors = {4:'tab:red', 5:'tab:green', 6:'tab:blue', 8:'tab:orange'}df_raw['cyl_color'] = df_raw.cyl.map(cyl_colors)# Mean and Median city mileage by makedf = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.mean())df.sort_values('cty', ascending=False, inplace=True)df.reset_index(inplace=True)df_median = df_raw[['cty', 'manufacturer']].groupby('manufacturer').apply(lambda x: x.median())# Draw horizontal linesfig, ax = plt.subplots(figsize=(16,10), dpi= 80)ax.hlines(y=df.index, xmin=0, xmax=40, color='gray', alpha=0.5, linewidth=.5, linestyles='dashdot')# Draw the Dotsfor i, make in enumerate(df.manufacturer):    df_make = df_raw.loc[df_raw.manufacturer==make, :]#    ax.scatter(y=np.repeat(i, df_make.shape[0]), x='cty', data=df_make, s=75, edgecolors='gray', c='w', alpha=0.5)    ax.scatter(y=np.repeat(i, df_make.shape[0]), x=df_make['cty'],  s=75, edgecolors='gray', c='w', alpha=0.5)    ax.scatter(y=i, x='cty', data=df_median.loc[df_median.index==make, :], s=75, c='firebrick')# Annotate    ax.text(33, 13, "\$red \; dots \; are \; the \: median\$", fontdict={'size':12}, color='firebrick')# Decorationsred_patch = plt.plot([],[], marker="o", ms=10, ls="", mec=None, color='firebrick', label="Median")plt.legend(handles=red_patch)ax.set_title('Distribution of City Mileage by Make', fontdict={'size':22})ax.set_xlabel('Miles Per Gallon (City)', alpha=0.7)ax.set_yticks(df.index)ax.set_yticklabels(df.manufacturer.str.title(), fontdict={'horizontalalignment': 'right'}, alpha=0.7)ax.set_xlim(1, 40)plt.xticks(alpha=0.7)plt.gca().spines["top"].set_visible(False)    plt.gca().spines["bottom"].set_visible(False)    plt.gca().spines["right"].set_visible(False)    plt.gca().spines["left"].set_visible(False)   plt.grid(axis='both', alpha=.4, linewidth=.1)plt.show()`

# Composition

Introduction to Waffle Diagrams, Tree Diagrams

## Waffle Figure

The waffle chart is a newer plot that shows how a whole is composed of categories (in the example below, the number of vehicles in each class).

`#! pip install pywaffle# Reference: https://stackoverflow.com/questions/41400136/how-to-do-waffle-charts-in-python-square-piechartfrom pywaffle import Waffle# Importdf_raw = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")# Prepare Datadf = df_raw.groupby('class').size().reset_index(name='counts')n_categories = df.shape[0]colors = [plt.cm.inferno_r(i/float(n_categories)) for i in range(n_categories)]# Draw Plot and Decoratefig = plt.figure(    FigureClass=Waffle,    plots={        111: {            'values': df['counts'],            'labels': ["{0} ({1})".format(n[0], n[1]) for n in df[['class', 'counts']].itertuples()],            'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.05, 1), 'fontsize': 12},            'title': {'label': '# Vehicles by Class', 'loc': 'center', 'fontsize':18}        },    },    rows=7,    colors=colors,    figsize=(16, 9))`

## Treemap

A treemap is similar to a pie chart, but it does a better job of not misleading the reader about the contribution of each group.

`# pip install squarifyimport squarify # Import Datadf_raw = pd.read_csv("./python/matplotlib-data/mpg_ggplot2.csv")# Prepare Datadf = df_raw.groupby('class').size().reset_index(name='counts')labels = df.apply(lambda x: str(x[0]) + "\n (" + str(x[1]) + ")", axis=1)sizes = df['counts'].values.tolist()colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]# Draw Plotplt.figure(figsize=(12,8), dpi= 80)squarify.plot(sizes=sizes, label=labels, color=colors, alpha=.8)# Decorateplt.title('Treemap of Vechile Class')plt.axis('off')plt.show()`

# Change

This section mainly deals with time series data and introduces the time series with peaks and troughs and the time series with two y-axes.

## peak and trough time series

Time series plots are used to visualize how a given metric changes over time. The chart below shows how air passenger traffic changed between 1949 and 1960.

The time series below plots all the peaks and troughs and annotates the occurrence of selected special events.

`# Import Datadf = pd.read_csv('./python/matplotlib-data/AirPassengers.csv')# Get the Peaks and Troughsdata = df['traffic'].valuesdoublediff = np.diff(np.sign(np.diff(data)))peak_locations = np.where(doublediff == -2)[0] + 1doublediff2 = np.diff(np.sign(np.diff(-1*data)))trough_locations = np.where(doublediff2 == -2)[0] + 1# Draw Plotplt.figure(figsize=(16,10), dpi= 80)plt.plot('date', 'traffic', data=df, color='tab:blue', label='Air Traffic')plt.scatter(df.date[peak_locations], df.traffic[peak_locations], marker=mpl.markers.CARETUPBASE, color='tab:green', s=100, label='Peaks')plt.scatter(df.date[trough_locations], df.traffic[trough_locations], marker=mpl.markers.CARETDOWNBASE, color='tab:red', s=100, label='Troughs')# Annotatefor t, p in zip(trough_locations[1::5], peak_locations[::3]):    plt.text(df.date[p], df.traffic[p]+15, df.date[p], horizontalalignment='center', color='darkgreen')    plt.text(df.date[t], df.traffic[t]-35, df.date[t], horizontalalignment='center', color='darkred')# Decorationplt.ylim(50,750)xtick_location = df.index.tolist()[::6]xtick_labels = df.date.tolist()[::6]plt.xticks(ticks=xtick_location, labels=xtick_labels, rotation=90, fontsize=12, alpha=.7)plt.title("Peak and Troughs of Air Passengers Traffic (1949 - 1969)", fontsize=22)plt.yticks(fontsize=12, alpha=.7)# Lighten bordersplt.gca().spines["top"].set_alpha(.0)plt.gca().spines["bottom"].set_alpha(.3)plt.gca().spines["right"].set_alpha(.0)plt.gca().spines["left"].set_alpha(.3)plt.legend(loc='upper left')plt.grid(axis='y', alpha=.3)plt.show()`

## Dual y-axis time series

If you want to show two time series measuring two different quantities at the same point in time, you can plot the second series on the second Y-axis on the right.

`# Import Datadf = pd.read_csv("./python/matplotlib-data/economics.csv")x = df['date']y1 = df['psavert']y2 = df['unemploy']# Plot Line1 (Left Y Axis)fig, ax1 = plt.subplots(1,1,figsize=(16,9), dpi= 80)ax1.plot(x, y1, color='tab:red')# Plot Line2 (Right Y Axis)ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axisax2.plot(x, y2, color='tab:blue')# Decorations# ax1 (left Y axis)ax1.set_xlabel('Year', fontsize=20)ax1.tick_params(axis='x', rotation=0, labelsize=12)ax1.set_ylabel('Personal Savings Rate', color='tab:red', fontsize=20)ax1.tick_params(axis='y', rotation=0, labelcolor='tab:red' )ax1.grid(alpha=.4)# ax2 (right Y axis)ax2.set_ylabel("# Unemployed (1000's)", color='tab:blue', fontsize=20)ax2.tick_params(axis='y', labelcolor='tab:blue')ax2.set_xticks(np.arange(0, len(x), 60))ax2.set_xticklabels(x[::60], rotation=90, fontdict={'fontsize':10})ax2.set_title("Personal Savings Rate vs Unemployed: Plotting in Secondary Y Axis", fontsize=22)fig.tight_layout()plt.show()`

# Groups

Introduction to dendrograms and cluster plots

## Dendrogram

`import scipy.cluster.hierarchy as shc# Import Datadf = pd.read_csv('./python/matplotlib-data/USArrests.csv')# Plotplt.figure(figsize=(16, 10), dpi= 80)  plt.title("USArrests Dendograms", fontsize=22)  dend = shc.dendrogram(shc.linkage(df[['Murder', 'Assault', 'UrbanPop', 'Rape']], method='ward'), labels=df.State.values, color_threshold=100)  plt.xticks(fontsize=12)plt.show()`

## Cluster diagram

Cluster plots can be used to demarcate points that belong to the same cluster. The example below uses the 'Murder' and 'Assault' columns as the X-axis and Y-axis; alternatively, you can use the first two principal components.

`from sklearn.cluster import AgglomerativeClusteringfrom scipy.spatial import ConvexHull# Import Datadf = pd.read_csv('./python/matplotlib-data/USArrests.csv')# Agglomerative Clusteringcluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')  cluster.fit_predict(df[['Murder', 'Assault', 'UrbanPop', 'Rape']])  # Plotplt.figure(figsize=(14, 10), dpi= 80)  plt.scatter(df.iloc[:,0], df.iloc[:,1], c=cluster.labels_, cmap='tab10')  # Encircledef encircle(x,y, ax=None, **kw):    if not ax: ax=plt.gca()    p = np.c_[x,y]    hull = ConvexHull(p)    poly = plt.Polygon(p[hull.vertices,:], **kw)    ax.add_patch(poly)# Draw polygon surrounding vertices    encircle(df.loc[cluster.labels_ == 0, 'Murder'], df.loc[cluster.labels_ == 0, 'Assault'], ec="k", fc="gold", alpha=0.2, linewidth=0)encircle(df.loc[cluster.labels_ == 1, 'Murder'], df.loc[cluster.labels_ == 1, 'Assault'], ec="k", fc="tab:blue", alpha=0.2, linewidth=0)encircle(df.loc[cluster.labels_ == 2, 'Murder'], df.loc[cluster.labels_ == 2, 'Assault'], ec="k", fc="tab:red", alpha=0.2, linewidth=0)encircle(df.loc[cluster.labels_ == 3, 'Murder'], df.loc[cluster.labels_ == 3, 'Assault'], ec="k", fc="tab:green", alpha=0.2, linewidth=0)encircle(df.loc[cluster.labels_ == 4, 'Murder'], df.loc[cluster.labels_ == 4, 'Assault'], ec="k", fc="tab:orange", alpha=0.2, linewidth=0)# Decorationsplt.xlabel('Murder'); plt.xticks(fontsize=12)plt.ylabel('Assault'); plt.yticks(fontsize=12)plt.title('Agglomerative Clustering of USArrests (5 Groups)', fontsize=22)plt.show()`

# Summary

This article shows only some of the top 50 common plots; for the complete code, please check the website directly:

https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/

image-20230325234912188

Tags: Python

Posted by yoki on Sun, 02 Apr 2023 06:32:58 +0930