Python数据可视化 | Visualization tricks using Seaborn (2)

原创于 2020-02-08 11:29:07 发布 · 696 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

Data Visualization 专栏收录该内容

5 篇文章

订阅专栏

本文介绍使用Seaborn库进行数据可视化的多种方法，涵盖不同类型的变量关系展示，包括联合分布、核密度估计、条形图、时间序列及回归分析等。

Visualization tricks using Seaborn (2)

In the process of making visual charts, we often need to deal with the relationship between numeric variables(N) and category variables©. Somethings we need to deal with time series, too. In this article, I will introduce these visualization methods.

14 jointplot for 2N-regression

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="darkgrid")

data={'income':[123,144,168,171,180,185,187,189,190,193,194,195,199,202,203,204,210,222,224,228,
               229,230,231,231,234,238,239,242,244,245,250,253,254,258,260,278,292,313,346,357],
     'spent':[88,110,102,119,144,92,138,131,130,166,156,159,185,113,144,159,159,166,178,179,
             181,168,177,190,188,185,177,171,189,202,198,192,201,194,204,219,244,241,323,286]}
data=pd.DataFrame(data)
g = sns.jointplot("income", "spent", data=data, kind="reg",
                  xlim=(100, 400), ylim=(80, 350), color="m", height=7)

在这里插入图片描述

15 kdeplot for 1C1N

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Create the data

rs = np.random.RandomState(1979)
x = rs.randn(500)
g = np.tile(list("ABCDEFGHIJ"), 50)
df = pd.DataFrame(dict(x=x, g=g))
m = df.g.map(ord)
df["x"] += m

# Initialize the FacetGrid object

pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", aspect=15, height=.5, palette=pal)

# Draw the densities in a few steps

g.map(sns.kdeplot, "x", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)

# Define and use a simple function to label the plot in axes coordinates

def label(x, color, label):
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)


g.map(label, "x")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.25)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

在这里插入图片描述

16 stripplot for 2C1N

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

data={'location':['north']*30+['south']*30+['east']*30+['west']*30,
     'type':(['A']*10+['B']*10+['C']*10)*4,
     'value':[44,5,3,5,4,6,7,3,5,4,6,8,5,5,5,3,4,7,4,5,4,7,5,13,5,4,6,8,8,5,5,4,3,5,2,7,6,5,4,5,
             4,6,7,15,3,6,3,6,6,4,5,7,5,6,8,8,6,4,5,7,3,6,5,5,5,3,6,3,5,5,35,4,2,2,6,5,6,7,8,6,
             5,6,3,6,3,6,8,9,7,7,6,4,15,43,6,4,6,3,5,7,3,7,6,6,5,7,5,5,7,7,65,7,4,5,4,6,7,8,7,5]}
data=pd.DataFrame(data)
# Initialize the figure
f, ax = plt.subplots(figsize=(11,11))
sns.despine(bottom=True, left=True)
# Show each observation with a scatterplot

sns.stripplot(x="value", y="location", hue="type",
              data=data, dodge=True, jitter=True,
              alpha=.25, zorder=1)
# Show the conditional means

sns.pointplot(x="value", y="location", hue="type",
              data=data, dodge=.532, join=False, palette="dark",
              markers="d", scale=.75, ci=None)
# Improve the legend 

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[3:], labels[3:], title="type",
          handletextpad=0, columnspacing=1,
          loc="lower right", ncol=3, frameon=True)

在这里插入图片描述

17 replot for 1C2N time series

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")

# Load the example mpg dataset

mpg = sns.load_dataset("mpg")

data={'city':['Ubn']*10+['Rsk']*10+['Pcd']*10+['Grt']*10+['Rbn']*10+['Yml']*10+['Gcr']*10+['Ati']*10,
     'year':[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,
            1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,
            1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,
            1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10],
     'ave_eco':[23,24,26,33,36,45,58,70,79,90,22,26,36,39,47,58,68,79,90,101,
               18,20,22,27,31,44,51,56,63,74,11,12,14,16,25,34,37,42,47,51,
               14,18,22,23,29,35,45,51,56,63,32,37,46,57,70,79,86,100,112,123,
               10,15,19,22,24,28,34,40,46,58,12,15,19,24,34,44,53,59,60,66],
     'population':[36,37,38,39,42,44,45,46,47,49,51,52,52,54,55,57,58,59,61,62,
                  98,92,94,95,108,119,121,122,133,135,144,149,155,169,173,176,182,191,194,199,
                  23,25,27,29,31,33,35,37,44,48,51,52,54,56,57,57,58,58,59,61,
                  22,21,22,24,30,36,37,38,39,44,46,49,51,54,56,58,62,69,77,86]}
data=pd.DataFrame(data)
# Plot miles per gallon against horsepower with other semantics

sns.relplot(x="year", y="ave_eco", hue="city", size="population",
            sizes=(10, 500), alpha=.5, palette="muted",
            height=6, data=data)

在这里插入图片描述

18 lineplot for 2C1N time series


# prepare the data

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.set(style="darkgrid")

data={'time':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
             1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
              1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
              1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
     'type':['strong']*30+['weak']*30,
     'shape':(['dark']*15+['light']*15)*2,
     'value':[5,6,3,5,5,3,4,4,5,6,7,8,5,9,11,
             4,5,3,5,6,3,2,4,6,6,7,8,10,7,5,
             3,4,5,2,4,5,3,4,3,5,6,7,8,8,9,
             2,1,4,3,4,4,5,5,6,3,3,5,5,7,7]}
data=pd.DataFrame(data)

# Plot

plt.subplots(figsize=(11,11))
sns.lineplot(x="time", y="value",
             hue="type", style="shape",
             data=data)

在这里插入图片描述

19 relplot for 2C1N time series

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(style="ticks")

data={'type':['North']*100+['South']*100,
      'category':(['A']*20+['B']*20+['C']*20+['D']*20+['E']*20)*2,
      'time':['01','02','03','04','05','06','07','08','09','10',
              '11','12','13','14','15','16','17','18','19','20']*10,
      'rate':[40,41,46,42,47,46,57,63,55,64,67,62,57,45,37,22,29,19,27,28,
             40,41,44,47,49,55,52,56,57,56,67,66,53,41,22,35,36,30,31,33,
             40,41,42,49,43,45,55,58,67,66,69,56,55,51,35,39,24,21,28,37,
             40,41,42,46,44,52,56,67,68,69,65,63,66,40,20,25,28,44,34,35,
             40,41,41,46,44,42,51,58,67,58,60,64,62,45,34,33,37,23,26,29,
             40,42,41,45,47,47,55,57,57,65,64,67,64,55,24,29,24,39,33,34,
             40,42,47,52,45,56,60,64,69,65,68,64,69,55,41,35,37,33,32,35,
             40,41,43,44,47,57,66,61,64,67,69,61,54,51,30,33,22,29,28,25,
             40,41,44,48,49,51,52,59,53,68,62,64,56,44,29,33,32,30,38,31,
             40,41,49,48,50,41,52,54,68,62,66,64,60,44,34,39,32,30,33,38,]
}
data=pd.DataFrame(data)

# Define a palette to ensure that colors will be

palette = dict(zip(data.category.unique(),
                   sns.color_palette("rocket_r", 6)))

# Plot the lines on two facets

sns.relplot(x="time", y="rate",
            hue="category", col="type",
            palette=palette,
            height=8, aspect=.75, facet_kws=dict(sharex=False),
            kind="line", legend="full", data=data)

在这里插入图片描述

20 scatterplot for 1C3N

# prepare the data

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
sns.set(style="whitegrid")
data={'size':[1,1.2,1.3,1,1.1,1,1.5,1,1.7,1.7,1.9,1.9,2,2,2.2,2.3,2.5,2.5,2.6,2.7,
             2.9,2.9,3,3.1,3.3,3.4,3.5,3.7,3.7,4.1,4.2,4.4,4.4,4.7,5.7,5.9,5.9,6,6.6,6.9],
     'price':[13,15,16,14,16,17,18,18,17,19,18,29,28,27,34,35,36,37,36,33,
             23,24,45,46,41,42,46,43,47,50,51,53,56,57,67,67,63,76,87,89],
     'huge':[8,8,9,8,9,10,11,11,9,10,11,14,15,7,11,9,8,10,9,9,
            10,11,8,8,7,9,10,10,12,8,9,10,13,12,9,8,10,10,11,11],
     'quality':['D','B','B','C','C','B','A','A','E','C','D','A','B','D','D','C','E','A','F','G',
               'G','D','B','C','C','D','B','E','B','F','E','D','C','C','B','B','E','A','C','A']}
data=pd.DataFrame(data)

# plot

f, ax = plt.subplots(figsize=(11, 11))
sns.despine(f, left=True, bottom=True)
clarity_ranking = ['A','B','C','D','E','F','G']
sns.scatterplot(x="size", y="price",
                hue="quality", size="huge",
                palette="ch:r=-.2,d=.3_r",
                hue_order=clarity_ranking,
                sizes=(50, 400), linewidth=0,
                data=data, ax=ax)

在这里插入图片描述

21 distplot for distributions

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
sns.set(style="white",palette="muted",color_codes=True)
rs=np.random.RandomState(10)
data=rs.normal(size=1000)

# plot

f,axes=plt.subplots(2,2,figsize=(11,11),sharex=True)
sns.despine(left=True)
sns.distplot(data,kde=False,color='purple',ax=axes[0,0])
sns.distplot(data,hist=False,color='red',ax=axes[0, 1])
sns.distplot(data,hist=False, color="blue", kde_kws={"shade": True}, ax=axes[1, 0])
sns.distplot(data,rug=True,color="gold", ax=axes[1, 1])
plt.setp(axes, yticks=[])
plt.tight_layout()

'''
rug: density below the graph (default: False)
kde: density line (default:True)
hist: histgram (default:True)
bins: the number of histgram bar
'''

在这里插入图片描述

22 lmplot for 1C2N-regression

# prepare the data

import pandas as pd
data={'dataset':['A']*10+['B']*10+['C']*10+['D']*10,
     'x':[1,2,3,4,6,7,5,7,4,7,
         1,3,5,6,2,4,5,3,4,5,
         3,4,5,1,3,6,7,9,7,9,
         8,5,3,6,5,4,3,2,1,1],
     'y':[1,2,3,4,5,6,7,8,9,10,
         1,2,3,4,5,6,7,8,9,10,
         1,2,3,4,5,6,7,8,9,10,
         1,2,3,4,5,6,7,8,9,10]}
data=pd.DataFrame(data)

# plot

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")
# Show the results of a linear regression within each dataset

sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=data,
           col_wrap=2, ci=None, palette="muted", height=4,
           scatter_kws={"s": 50, "alpha": 1})
'''
col_wrap=2 means two subplots a line
"s" means scatter size
'''
plt.show()

在这里插入图片描述

23 implot for 1C2N-regression

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

# Load the iris dataset

iris = sns.load_dataset("iris")
data={'species':['A']*10+['B']*10+['C']*10+['D']*10,
     'x':[13,14,16,18,19,22,23,25,26,27,9,11,11,14,14,15,16,17,19,20,
         13,15,16,16,16,18,18,26,29,35,20,21,22,23,28,29,29,30,30,31],
     'y':[15,16,19,13,10,22,29,24,19,25,22,26,29,22,21,34,39,44,30,45,
         15,16,15,14,16,11,11,10,22,20,22,25,21,19,22,28,28,24,25,30]}
data=pd.DataFrame(data)

g = sns.lmplot(x="x", y="y", hue="species",
               truncate=True, height=5, data=data)
g.set_axis_labels("x", "y")

在这里插入图片描述

24 facetgrid for 1C2N

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="ticks")

# Create a dataset with many short random walks

data={'step':[1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,
             1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,
             1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,
             1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,],
     'position':[3,4,5,6,7,3,5,4,2,6,4,3,3,2,6,6,4,3,5,6,4,3,2,5,6,
                3,7,6,6,4,4,5,6,3,5,5,6,7,4,5,6,5,4,3,4,4,5,6,3,4,
                4,5,6,3,5,5,6,7,5,4,2,4,5,3,5,6,4,5,3,6,5,4,4,5,6,
                3,5,4,5,3,5,6,4,5,6,3,5,5,4,6,6,5,4,6,5,5,6,3,6,3],
     'category':['01']*5+['02']*5+['03']*5+['04']*5+['05']*5+
             ['06']*5+['07']*5+['08']*5+['09']*5+['10']*5+
             ['11']*5+['12']*5+['13']*5+['14']*5+['15']*5+
             ['16']*5+['17']*5+['18']*5+['19']*5+['20']*5}
data=pd.DataFrame(data)

# Initialize a grid of plots with an Axes for each walk

grid = sns.FacetGrid(data, col="category", hue="step", palette="tab20c",
                     col_wrap=4, height=1.5)
# Draw a line plot to show the trajectory of each random walk

grid.map(plt.plot, "step", "position", marker="o")
grid.set(xlim=(-.5, 5.5), ylim=(1, 7))
grid.fig.tight_layout(w_pad=1)

在这里插入图片描述

25 kdeplot for distributions

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="dark")
rs = np.random.RandomState(50)
f, axes = plt.subplots(4, 4, figsize=(12, 12), sharex=True, sharey=True)
# Rotate the starting point around the cubehelix hue circle

for ax, s in zip(axes.flat, np.linspace(0, 3, 16)):

    # Create a cubehelix colormap to use with kdeplot
    
    cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)
    # Generate and plot a random bivariate dataset
    
    x, y = rs.randn(2, 50)
    sns.kdeplot(x, y, cmap=cmap, shade=True, cut=10, ax=ax)
    ax.set(xlim=(-3, 3), ylim=(-3, 3))
f.tight_layout()

在这里插入图片描述

26 combined plot A

import seaborn as sns
sns.set(style="white")

df = sns.load_dataset("iris")

g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=3)

在这里插入图片描述

27 combined plot B

import seaborn as sns
sns.set(style="ticks")

df = sns.load_dataset("iris")
sns.pairplot(df, hue="species")

在这里插入图片描述