Skip to content

Working with Dataframes in Pandas

mattgillucl edited this page Feb 4, 2021 · 7 revisions

Documentation

The pertinent chapter in the Pandas documentation.

Nomenclature

Pandas Dataframes have:

  • Index -> the row label or number
  • Columns -> the column label or number
  • Data -> the cell value

Let's explore!

import pandas as pd
import numpy as np

# data from 
# https://data.england.nhs.uk/dataset/phe-indicator-22303/resource/e8a04f32-81be-459c-bb24-c404946d7b02
df = pd.read_csv('https://data.england.nhs.uk/dataset/'
    'd296bb49-f36a-48c2-8aac-74cf3f0d09e5/resource/e8a04f32-81be-459c-bb24-c404946d7b02/'
    'download/223iiiselfreportedwellbeingpeoplewithalowhappinessscore.data.csv')

# some information about the data
df.index   # the rows
df.columns # the columns

# Selecting data using [] (the indexing operator)
# Can't be used to select row and column simultaneously

c = df['Age']  # single column
type(c)        # returns Series

c = df[['Area Code', 'Value']]  # multiple columns
type(c)                         # returns Dataframe

c = df[['Age']] # single column
type(c)         # Dataframe

# Use .loc to select subset of rows and/or columns, using label

c = df.loc[0] # single row
type(c)       # returns _Series_

c = df.loc[[0, 1, 2, 3]] # multiple rows
type(c)                  # returns Dataframe

c = df.loc[0:4] # multiple rows using slice...(includes row 4!)
type(c) # returns Dataframe

c = df.loc[[0,1,4], ['Area Code', 'Age', 'Value']]  # subset of rows and columns
type(c) # Dataframe

c = df.loc[[1,2,3], :] # select all columns using colon
df.loc[[1,2,3]] # equivalent

# Make selection using Integer LOCation

df.iloc[[1,2,3], [0,1,4]] # selection using position in table

df.iloc[1:3] # turns rows 1 & 2, *not* 3 (cf. df.loc[1:3])


# Weird behaviour
df['Age'] # column in dataframe
df[0] # error! can't do this
df[0:5] # the rows of the dataframe!
# so...always use .loc or .iloc!


# dot notation/attribute
c = df.Age # select column of dataframe
type(c) # series

# can't use dot operator for columns with spaces

# boolean indexing

# Using the boolean indexing with [] operator returns rows, not columns (see above)
df1 = df.head(10)
df1[[True, False, True, False, True, False, True, False, True, False]] # boolean index in list

# Using []:
# 1. string - return column as Series
# 2. list of strings - return columns as Dataframe
# 3. slice - select rows
# 4. booleans - return True rows

# boolean index using comparison (returns Series)
bi = df['Value'] > 15
type(bi)

# select True rows
df[bi]

# or
df[df['Value'] > 15]

# multiple comparisons, instead of and, or, not use & | ~
# remember parentheses around each comparisons!!
df[(df['Lower CI limit'] > 10) & (df['Upper CI limit'] < 15)]

# clearer
cmp1 = df['Lower CI limit'] > 10
cmp2 = df['Upper CI limit'] < 15
df[cmp1 & cmp2]

cmp1 = df['Value'] < 15 
cmp2 = df['Value'].notna()
df[~cmp1 & cmp2]

# using .loc (allows both row and column selection)
# df.loc[row, column]
cmp1 = df['Lower CI limit'] > 10
cmp2 = df['Upper CI limit'] < 15
df.loc[cmp1 & cmp2, ['Age', 'Area Name']]

# note: .iloc doesn't work with boolean selection

# assigning values to dataframe

# add a new column
df['NewColumn'] = 0 # all rows get the same value

# random number
df['NewColumn'] = np.random.randint(1, 5, size=len(df))

# from other columns
df['NewColumn'] = df['Upper CI limit'] - df['Lower CI limit']

# assign to all rows in a column using .loc
df.loc[:, 'NewColumn'] = np.random.randint(0, 100, size=len(df))

# assign to some rows in a column
df_old = df.copy()
df.loc[df['NewColumn'] % 2 == 0, 'Category'] = 'Okay'
df.loc[df['NewColumn'] % 2 != 0, 'Category'] = 'Not Okay'

# careful when using chained indexing
df1 = df.head(10)
df1[0:5]['Category']

# this won't work
df.loc[df['NewColumn'] > 95]['Category']
df.loc[df['NewColumn'] > 95]['Category'] = 'Selected'
df.loc[df['NewColumn'] > 95]['Category']

# this will work
df.loc[df['NewColumn'] > 95, 'Category'] = 'Selected'
df.loc[df['NewColumn'] > 95]['Category']

# but, warning!
df['Category'][df['NewColumn'] > 95] = 'SelectedAgain'
df.loc[df['NewColumn'] > 95]['Category']

# because getting a column from dataframe using [] creates a view, not a copy
# always use a single .loc statement to get the rows and columns you want, and then assign
Clone this wiki locally