-
Notifications
You must be signed in to change notification settings - Fork 7
Working with Dataframes in Pandas
mattgillucl edited this page Feb 4, 2021
·
7 revisions
The pertinent chapter in the Pandas documentation.
Pandas Dataframes have:
- Index -> the row label or number
- Columns -> the column label or number
- Data -> the cell value
import pandas as pd
import numpy as np
# data from
# https://data.england.nhs.uk/dataset/phe-indicator-22303/resource/e8a04f32-81be-459c-bb24-c404946d7b02
df = pd.read_csv('https://data.england.nhs.uk/dataset/'
'd296bb49-f36a-48c2-8aac-74cf3f0d09e5/resource/e8a04f32-81be-459c-bb24-c404946d7b02/'
'download/223iiiselfreportedwellbeingpeoplewithalowhappinessscore.data.csv')
# some information about the data
df.index # the rows
df.columns # the columns
# Selecting data using [] (the indexing operator)
# Can't be used to select row and column simultaneously
c = df['Age'] # single column
type(c) # returns Series
c = df[['Area Code', 'Value']] # multiple columns
type(c) # returns Dataframe
c = df[['Age']] # single column
type(c) # Dataframe
# Use .loc to select subset of rows and/or columns, using label
c = df.loc[0] # single row
type(c) # returns _Series_
c = df.loc[[0, 1, 2, 3]] # multiple rows
type(c) # returns Dataframe
c = df.loc[0:4] # multiple rows using slice...(includes row 4!)
type(c) # returns Dataframe
c = df.loc[[0,1,4], ['Area Code', 'Age', 'Value']] # subset of rows and columns
type(c) # Dataframe
c = df.loc[[1,2,3], :] # select all columns using colon
df.loc[[1,2,3]] # equivalent
# Make selection using Integer LOCation
df.iloc[[1,2,3], [0,1,4]] # selection using position in table
df.iloc[1:3] # turns rows 1 & 2, *not* 3 (cf. df.loc[1:3])
# Weird behaviour
df['Age'] # column in dataframe
df[0] # error! can't do this
df[0:5] # the rows of the dataframe!
# so...always use .loc or .iloc!
# dot notation/attribute
c = df.Age # select column of dataframe
type(c) # series
# can't use dot operator for columns with spaces
# boolean indexing
# Using the boolean indexing with [] operator returns rows, not columns (see above)
df1 = df.head(10)
df1[[True, False, True, False, True, False, True, False, True, False]] # boolean index in list
# Using []:
# 1. string - return column as Series
# 2. list of strings - return columns as Dataframe
# 3. slice - select rows
# 4. booleans - return True rows
# boolean index using comparison (returns Series)
bi = df['Value'] > 15
type(bi)
# select True rows
df[bi]
# or
df[df['Value'] > 15]
# multiple comparisons, instead of and, or, not use & | ~
# remember parentheses around each comparisons!!
df[(df['Lower CI limit'] > 10) & (df['Upper CI limit'] < 15)]
# clearer
cmp1 = df['Lower CI limit'] > 10
cmp2 = df['Upper CI limit'] < 15
df[cmp1 & cmp2]
cmp1 = df['Value'] < 15
cmp2 = df['Value'].notna()
df[~cmp1 & cmp2]
# using .loc (allows both row and column selection)
# df.loc[row, column]
cmp1 = df['Lower CI limit'] > 10
cmp2 = df['Upper CI limit'] < 15
df.loc[cmp1 & cmp2, ['Age', 'Area Name']]
# note: .iloc doesn't work with boolean selection
# assigning values to dataframe
# add a new column
df['NewColumn'] = 0 # all rows get the same value
# random number
df['NewColumn'] = np.random.randint(1, 5, size=len(df))
# from other columns
df['NewColumn'] = df['Upper CI limit'] - df['Lower CI limit']
# assign to all rows in a column using .loc
df.loc[:, 'NewColumn'] = np.random.randint(0, 100, size=len(df))
# assign to some rows in a column
df_old = df.copy()
df.loc[df['NewColumn'] % 2 == 0, 'Category'] = 'Okay'
df.loc[df['NewColumn'] % 2 != 0, 'Category'] = 'Not Okay'
# careful when using chained indexing
df1 = df.head(10)
df1[0:5]['Category']
# this won't work
df.loc[df['NewColumn'] > 95]['Category']
df.loc[df['NewColumn'] > 95]['Category'] = 'Selected'
df.loc[df['NewColumn'] > 95]['Category']
# this will work
df.loc[df['NewColumn'] > 95, 'Category'] = 'Selected'
df.loc[df['NewColumn'] > 95]['Category']
# but, warning!
df['Category'][df['NewColumn'] > 95] = 'SelectedAgain'
df.loc[df['NewColumn'] > 95]['Category']
# because getting a column from dataframe using [] creates a view, not a copy
# always use a single .loc statement to get the rows and columns you want, and then assign
TLO Model Wiki