443-970-2353
[email protected]
CV Resume
Pandas is a commonly used data manipulation library in Python. Data.Table, on the other hand, is among the best data manipulation packages in R. Data.Table is succinct and we can do a lot with Data.Table in just a single line. Further, data.table is, generally, faster than Pandas (see benchmark here) and it may be a go-to package when performance is a constraint. For someone who knows one of these packages, I thought it could help to show codes that perform the same tasks in both packages side by side to help them quickly study the other. If you know either of them and want to learn the other, this blog post is for you.
The data is Hair and Eye Color of Statistics Students from this website.
# Import Library
import pandas as pd
library(data.table)
# Read data from a url
url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/HairEyeColor.csv"
df = pd.read_csv(url)
url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/HairEyeColor.csv"
df = fread(url)
# Type of the df object
type(df)
class(df)
# data.table is data.frame
# Column names
list(df)
names(df)
# Show first few rows
df.head()
head(df)
# Show last few rows
df.tail()
tail(df)
# Data type of each column
df.dtypes
str(df)
# Return number of columns and rows of dataframe
df.shape
dim(df)
# Number of rows
len(df.index)
nrow(df)
# Number of columns
len(df.columns)
ncol(df)
# Basic statistics
df.describe()
summary(df)
# Extract first three rows
df[0:3]
# or
df.iloc[:3]
df[1:3]
# Filter for black hair
df[df['Hair']=="Black"]
# or
df.query("Hair =='Black'")
df[Hair == "Black"]
# or
setkey(df, Hair)
df["Black"]
# Filter for males who have black hair
df[(df['Hair']=="Black") & (df["Sex"]=="Male")]
# or
df.query("Hair == 'Black' & Sex =='Male'")
df[Hair == "Black" & Sex == "Male"]
# or
setkey(df, Hair, Sex)
df[.("Black","Male")]
# Filter for those who have brown eye or black hair
z = df[(df['Hair']=="Black") | (df["Eye"]=="Brown")]
# or
z = df.query("Hair == 'Black' | Eye =='Brown'")
z.head(6)
z = df[Hair== "Black" | Eye == "Brown"]
head(z)
# Filter for eye color of blue, hazel and green
df[df.Eye.isin(['Blue','Hazel','Green'])].head()
z = df[Eye %in%
c('Blue','Hazel',
'Green')]
head(z)
# or
setkey(df, Eye)
z = df[c("Blue","Hazel", "Green")]
head(z)
# Select one column
df[["Eye"]].head()
# or
df.Eye.head()
z = df[,.(Eye)]
head(z)
# Select two columns
df[["Eye","Sex"]].head()
z = df[,.(Eye,Sex)]
head(z)
# Unique Eye colors
df["Eye"].unique()
df[,unique(Eye)]
# Maximum of the "Freq" column
df.Freq.max()
df[,max(Freq)]
# Call functions on multiple columns
import numpy as np
pd.DataFrame({'Max_freq': [df.Freq.max()], \
'Min_freq': [df.Freq.min()],\
'Std_freq': [np.std(df.Freq)]})
df[,.(Max_freq =
max(Freq),
Min_freq =
min(Freq),
Std_freq =
sd(Freq))]
# Maximum Frequency by Sex
df.groupby("Sex").agg({"Freq":"max"})
df[, max(Freq),
by = Sex]
# Count by Eye color and Sex
df.groupby(["Eye","Sex"]).agg({"Freq":"count"}).rename(columns={"Freq":"Count"})
df[,.(Count = .N),
by = .(Eye, Sex)]
# Call functions for grouping
df.assign(Gt50 = (df.Freq > 50)).groupby("Gt50").agg({"Gt50":"count"})\
.rename(columns ={"Gt50":"Count"})
df[,.(Count = .N),
by = .(Gt50 =
Freq > 50)]
# Do the analysis on selected rows only
pd.DataFrame({'Max_freq': [df[0:10].Freq.max()], \
'Min_freq': [df[0:10].Freq.min()],\
'Std_freq': [np.std(df[0:10].Freq)]})
df[1:10,.(Max_freq =
max(Freq),
Min_freq =
min(Freq),
Std_freq =
sd(Freq))]
# Remove a column
df.drop('Unnamed: 0', 1).head()
head(df[, V1 := NULL])
# Return the first occurance
df.query("Eye == 'Blue'")[:1]
setkey(df, Eye)
df["Blue", mult = "first"]
# Return the last occurance
df.query("Eye == 'Blue'")[-1:]
setkey(df, Eye)
df["Blue", mult = "last"]
# Return a count
df[df.Eye.isin(['Blue','Hazel']) & (df.Sex=="Male")].shape[0]
setkey(df, Eye, Sex)
df[.(c("Blue","Hazel"), "Male"), .N]
# Count for each group
df[df.Eye.isin(['Blue','Hazel']) & (df.Sex=="Male")].\
groupby(["Eye","Sex"]).agg({"Freq":"count"}).rename(columns={"Freq":"Count"})
setkey(df, Eye, Sex)
df[.(c("Blue","Hazel"),
"Male"), .N,
by =.EACHI]
# Order in ascending order
df.sort_values(by='Freq').tail(6)
tail(df[order(Freq)])
# Order in descending order
df.sort_values(by='Freq', ascending = False).tail(6)
tail(setorder(df, -Freq))
# "Freq" in descending and "Eye" in ascending
df.sort_values(by=['Freq','Eye'], ascending = [False,True]).tail(6)
tail(setorder(df, -Freq, Eye))
# Rename columns
df.rename(columns = {"Freq":"Frequency","Eye":"Eye_Color"}).tail()
setnames(df,c("Freq","Eye"),c("Frequency", "Eye_Color"))
tail(df)
# Unique rows
df[["Eye","Sex"]].drop_duplicates()
unique(df[,.(Eye_Color,Sex)])
# Create new column
df.assign(Eye_Hair =df.Eye + df.Hair)[["Eye","Hair","Eye_Hair"]].head()
z = df[, Eye_Hair := paste(Eye_Color, Hair)][,.(Eye_Color, Hair, Eye_Hair)]
head(z)