Import Libraries

#collapse-hide

# ignore library warnings
import warnings
warnings.filterwarnings('ignore')

# data manipulation
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

sns.set_style('darkgrid')

Import Datasets

tips_df = sns.load_dataset('tips')
tips_df.sample(5)
total_bill tip sex smoker day time size
63 18.29 3.76 Male Yes Sat Dinner 4
47 32.40 6.00 Male No Sun Dinner 4
238 35.83 4.67 Female No Sat Dinner 3
34 17.78 3.27 Male No Sat Dinner 2
210 30.06 2.00 Male Yes Sat Dinner 3
tips_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB

distplot


correlation between Tip and Total bill.

What is the probability of getting a higher tip based on the total bill?

sns.relplot(x='total_bill', y='tip', data=tips_df)
plt.show()

We can see that there is a linear relationship between the total bill and the tip, i.e. when the total bill increases, the tip is likely to be higher.

Let's add a hue='smoker' to get more insights on who gives better tip between a smoker and non-smoker.

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='smoker')
plt.show()

Note: We can see that there are more non-smokers, and that only a few tips are above 6; those occur when the total bill exceeds $30.

So how many smokers are there?

tips_df.smoker.value_counts()
No     151
Yes     93
Name: smoker, dtype: int64

We can change the hue to sex, size, or time and drill through to understand the tip based on gender [Male, Female] and time [Lunch or Dinner].

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='smoker', style='time')
plt.show()
sns.relplot(data=tips_df, x='total_bill', y='tip', hue='size')
plt.show()

Note: As size increases the tip increases

Setting palettes

  • palette='ch:r=0.7, l=0.85'

#collapse-show

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='size', palette='ch:r=0.7, l=0.85')
plt.show()

Now lets set size based on size

  • Try setting style='size' and you will see different marker shapes (polygons) on the chart.
  • Alternatively, you can change the size of the points based on the discrete values:
  • set size='size'
sns.relplot(data=tips_df, x='total_bill', y='tip', size='size')
plt.show()

Let's control the range of the point sizes with the sizes=(min, max) parameter

# min=15
# max=200
sns.relplot(data=tips_df, x='total_bill', y='tip', size='size', sizes=(15,200))
plt.show()

SubPlots for Scatter Plots/ relplots
It won't always be easy to convey the information in one plot, so we introduce subplots to help clear things up.

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='smoker', col='time')
<seaborn.axisgrid.FacetGrid at 0x7f9a075a5080>

We can now see the plots categorized in two ways.

  1. outer category Time
  2. inner category smoker

This is beautiful and easy to digest! Now, let's try to interpret the plots.

  • During Dinner time there are more people than Lunch time, right?
    • So we can say more people go out for dinner than they do during lunch time.
  • When can one get better Tip?
    • Better tips are more likely during dinner time.
  • who are those people?
    • both smokers and non-smokers.
  • what's their gender?
    • Let's find out!


Tip: row and col can help you achieve subplots for different categories. Let's see it in action by adding row='sex'

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='smoker', col='time', row='sex',height=3.5)
<seaborn.axisgrid.FacetGrid at 0x7f9a06904978>

Ooh! Men like to impress during dinner dates; I wonder what the size of the meal is, LOL.
Check the total bill for males during dinner time compared to lunch time, LOL.

sns.relplot(data=tips_df, x='total_bill', y='tip', hue='smoker', col='size' , col_wrap=3, height=3.5)
<seaborn.axisgrid.FacetGrid at 0x7f9a052660b8>
sns.scatterplot(data=tips_df, x='total_bill', y='tip', hue='sex')
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a04eedac8>

Line Plot

# generate toy dataset
np.random.seed(1996)
df = pd.DataFrame(dict(time= np.arange(500), value= np.random.randn(500).cumsum()))
df.head()
time value
0 0 -0.572858
1 1 -0.418251
2 2 1.471370
3 3 0.857620
4 4 0.664801
sns.relplot(kind='line',data=df, x='time', y='value')
plt.show()

Tip: The value decreases with time.

i.e Here we focus more on the trend of the data.


Let's load a new dataset from seaborn called fmri.

fmri_df = sns.load_dataset('fmri')
fmri_df.sample(5)
subject timepoint event region signal
108 s4 11 stim parietal -0.128547
443 s9 12 stim frontal -0.083663
1033 s10 7 cue frontal -0.047084
1047 s10 10 cue frontal -0.016124
87 s11 12 stim parietal -0.178510

Lets plot a line plot for timepoint with signal

Tip: We can also use sns.relplot(kind='line', data=fmri_df, x='timepoint', y='signal')

The shaded zones are called confidence intervals.
We can turn them off with ci=False.

sns.lineplot(data=fmri_df, x='timepoint', y='signal')
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a0750c080>
sns.lineplot(data=fmri_df, x='timepoint', y='signal', ci=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a07417c88>

Note: Let's plot the line plot with hue to separate the lines based on categories: one with hue='event' and another one with hue='region'

sns.lineplot(data=fmri_df, x='timepoint', y='signal', hue='event', ci=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a075b4dd8>
sns.lineplot(data=fmri_df, x='timepoint', y='signal', hue='region', ci=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a076f15c0>

Tip: Keep your plots simple and easy to understand. Let the plots tell a story while you elaborate on the facts.

Let's add markers to our line plots to better understand the points
Important: markers will only work if you are using style on your plots

style='region'

dashes=False

markers=True
sns.lineplot( data=fmri_df, x='timepoint', y='signal', style='region', hue='region' , markers= True, ci=False, dashes=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a08086f98>

Tip: The number of hue values should not exceed 4 for a line plot. This keeps it clean and readable. I do not like dashes on line plots!
Important: Palettes are a great tool to control the color scheme of your plots. sns.cubehelix_palette(light=0.5, n_colors=6)

# set palettes 
palettes = sns.cubehelix_palette(light=0.5, n_colors=2)
sns.lineplot( data=fmri_df, x='timepoint', y='signal', style='region', hue='region', ci=False, dashes=False, palette=palettes)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a076fdc50>

Visualizing Categorical Data


  1. Univariate
    • countplot(data, x)
  2. Bi/Multi-Variate --> Categorical vs Numeric (discrete)
    • catplot(data, x, y)
    • boxplot(data, x, y)
    • barplot(data, x, y)
    • pointplot(data, x, y)
sns.catplot(data=tips_df, x='day', y='total_bill', hue='sex',  col='time', col_wrap=3, height=3.5, jitter=False)
<seaborn.axisgrid.FacetGrid at 0x7f9a05db5eb8>
sns.catplot(data=tips_df, x='smoker', y='tip', order=['No','Yes'])
<seaborn.axisgrid.FacetGrid at 0x7f9a04178470>
sns.catplot(data=tips_df, x='day', y='total_bill', hue='size', row='time', col='sex',  height=3.5)
<seaborn.axisgrid.FacetGrid at 0x7f9a04978dd8>
sns.catplot(data=tips_df, x='day', y='total_bill', hue='sex', jitter=False )
<seaborn.axisgrid.FacetGrid at 0x7f9a0432fba8>
sns.swarmplot(data=tips_df, x='day', y='total_bill', hue='sex')
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a04217438>

BOX PLOT Understanding the Statistics

tips_df.sample(5)
total_bill tip sex smoker day time size
89 21.16 3.00 Male No Thur Lunch 2
230 24.01 2.00 Male Yes Sat Dinner 4
184 40.55 3.00 Male Yes Sun Dinner 2
61 13.81 2.00 Male Yes Sat Dinner 2
64 17.59 2.64 Male No Sat Dinner 3
sns.boxplot(data=tips_df, x='day', y='total_bill', )
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a040bd4e0>
sns.boxplot(data=tips_df, x='day', y='total_bill', hue='time', dodge=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a0380e2e8>
ax = sns.boxenplot(data=tips_df, x='day', y='total_bill')
ax

Note: Focus on the Min, Max, Q1, Q2, Q3 and IQR when interpreting a box plot

BARPLOT Categorical vs numeric

sns.barplot(data=tips_df, x='sex', y='total_bill', ci=None )
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a03350208>
sns.countplot(data=tips_df, x='smoker', hue='sex' )
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a033bd278>
sns.pointplot(data=tips_df, x='sex', y='size', hue='smoker', ci=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a033a74a8>

Note: Non-smokers have a higher average size than smokers

Visualizing Distribution


  1. Univariate

  2. Bi-variate

Univariate Distribution

Important: A histogram represents the distribution of the data by forming bins (groups/clusters) along the range of the data and drawing bars to show the number of observations falling in each of these bins.

# generate data
np.random.seed(1996)
x = np.random.randn(100)

# plot x
sns.distplot(x, bins=20, kde=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a02957940>
sns.kdeplot(x, shade=True, bw=1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a02746978>

Bi-Variate Plot

tips_df.head()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
sns.jointplot(data=tips_df ,x='total_bill', y='tip')
<seaborn.axisgrid.JointGrid at 0x7f9a05316748>
sns.jointplot(x='total_bill', y='tip', data=tips_df, kind='kde', color='r')
<seaborn.axisgrid.JointGrid at 0x7f9a01c8e4e0>
g = sns.jointplot(x='total_bill', y='tip', data=tips_df, kind='kde', color='m')
g.plot_joint(plt.scatter, c='w', s=30, linewidth=1, marker='+')
g.ax_joint.collections[0].set_alpha(0)

Pair-Plot

Important: A pair plot is a very popular starting plot for a lot of machine learning EDA. The reason it is popular is that it helps data scientists get an overview of the distribution or correlation of all the features in one go. This makes it easier to see which features are important or can better explain the target.
Warning: A pair plot will only plot numerical features from your dataset. Make sure to correct all your data types before using a pair plot; check them with df.dtypes

iris = sns.load_dataset('iris')
iris.sample(5)
sepal_length sepal_width petal_length petal_width species
69 5.6 2.5 3.9 1.1 versicolor
107 7.3 2.9 6.3 1.8 virginica
28 5.2 3.4 1.4 0.2 setosa
112 6.8 3.0 5.5 2.1 virginica
127 6.1 3.0 4.9 1.8 virginica

We are now going to plot all the features using pair-plot!

sns.pairplot(iris)
<seaborn.axisgrid.PairGrid at 0x7f9a0238bd68>

Let's modify our pair plot a bit.

g = sns.PairGrid(iris)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels = 10)
<seaborn.axisgrid.PairGrid at 0x7f9a0200c320>

Linear Regression and Relationship


  1. regplot()
  2. lmplot()
tips_df.head()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

What is the linear relationship between total bill and tips?

sns.regplot(data=tips_df, x='total_bill', y='tip')
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a005d85c0>
sns.lmplot(x='size', y='tip', data=tips_df, x_jitter=0.05)
<seaborn.axisgrid.FacetGrid at 0x7f9a005d0588>

Let's fix the way the plot is shown by using x_estimator=np.mean

sns.lmplot(x='size', y='tip', data=tips_df, x_estimator=np.mean)
<seaborn.axisgrid.FacetGrid at 0x7f9a00206c88>

Let's load a new dataset to show what to do if the data does not have a linear relationship.

data = sns.load_dataset('anscombe')
data.sample(5)
dataset x y
0 I 10.0 8.04
21 II 5.0 4.74
8 I 12.0 10.84
41 IV 8.0 5.56
18 II 4.0 3.10
sns.lmplot( x='x', y='y', data=data.query("dataset == 'I'"), ci=None, scatter_kws={'s':80})
<seaborn.axisgrid.FacetGrid at 0x7f99ff00ac50>
sns.lmplot( x='x', y='y', data=data.query("dataset == 'II'"), ci=None, scatter_kws={'s':80}, order=2)
<seaborn.axisgrid.FacetGrid at 0x7f99ff0572b0>

Tip: To reduce the influence of outliers on the fit, use robust=True

sns.lmplot( x='x', y='y', data=data.query("dataset == 'III'"), ci=None, scatter_kws={'s':80}, robust=True)
<seaborn.axisgrid.FacetGrid at 0x7f99fec04470>
sns.lmplot(x='total_bill', y='tip', data=tips_df, hue='sex', markers = ['o', 'x'], col='smoker', row='time')
<seaborn.axisgrid.FacetGrid at 0x7f99fea783c8>
fig, ax = plt.subplots(figsize = (8,4))
sns.regplot(x='total_bill', y='tip', data=tips_df, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f99fe9961d0>

Controlling Plotted Figure Aesthetics

  • figure style
  • axes styling
  • color palettes
  • etc.
def sinplot(flip=1):
    """Draw six phase-shifted sine curves with shrinking amplitude.

    Each curve is plotted with ``plt.plot`` over 100 evenly spaced points
    on [0, 14]. Curve ``k`` (k = 1..6) is shifted by ``k * 0.05`` and
    scaled by ``(7 - k) * flip``.

    Args:
        flip: Multiplier applied to every curve; a negative value mirrors
            the curves about the x-axis.
    """
    xs = np.linspace(0, 14, 100)
    for k in range(1, 7):
        shifted = xs + k * 0.05
        # Keep the original left-to-right multiplication order so the
        # floating-point results match exactly.
        ys = np.sin(shifted) * (7 - k) * flip
        plt.plot(xs, ys)
sinplot(1)
sns.set_style('whitegrid', {'axes.grid': True, 'xtick.direction': 'out'})
sinplot()
sns.despine(left=False, bottom=False)
sns.set_context('paper')
sns.set_style('dark', {'axes.grid': False, 'xtick.direction': 'out'}, )
sinplot()
sns.axes_style()
{'axes.axisbelow': True,
 'axes.edgecolor': '.8',
 'axes.facecolor': 'white',
 'axes.grid': True,
 'axes.labelcolor': '.15',
 'axes.spines.bottom': True,
 'axes.spines.left': True,
 'axes.spines.right': True,
 'axes.spines.top': True,
 'figure.facecolor': 'white',
 'font.family': ['sans-serif'],
 'font.sans-serif': ['Arial',
  'DejaVu Sans',
  'Liberation Sans',
  'Bitstream Vera Sans',
  'sans-serif'],
 'grid.color': '.8',
 'grid.linestyle': '-',
 'image.cmap': 'rocket',
 'lines.solid_capstyle': 'round',
 'patch.edgecolor': 'w',
 'patch.force_edgecolor': True,
 'text.color': '.15',
 'xtick.bottom': False,
 'xtick.color': '.15',
 'xtick.direction': 'out',
 'xtick.top': False,
 'ytick.color': '.15',
 'ytick.direction': 'out',
 'ytick.left': False,
 'ytick.right': False}

How to find the currently used color palette

current_palettes = sns.color_palette()
sns.palplot(current_palettes)