import pandas as pd
d = { "SNAME": ["Alice","Bob","Charlie"], "AGE": [24, 32, 26], "MAJOR": ["CSC","MATH","CSC"]}
df = pd.DataFrame(d)
print(df)
print(df.shape) # I ran this cell after renaming the index
SNAME AGE MAJOR 0 Alice 24 CSC 1 Bob 32 MATH 2 Charlie 26 CSC (3, 3)
print(df.columns)
print(df.index)
Index(['SNAME', 'AGE', 'MAJOR'], dtype='object') RangeIndex(start=0, stop=3, step=1)
df["SNAME"] # project out columns
0 Alice 1 Bob 2 Charlie Name: SNAME, dtype: object
df[["SNAME","MAJOR"]]
SNAME | MAJOR | |
---|---|---|
0 | Alice | CSC |
1 | Bob | MATH |
2 | Charlie | CSC |
df[0:2]
SNAME | AGE | MAJOR | |
---|---|---|---|
0 | Alice | 24 | CSC |
1 | Bob | 32 | MATH |
df.index = ["s1","s2","s3"]
df
SNAME | AGE | MAJOR | |
---|---|---|---|
s1 | Alice | 24 | CSC |
s2 | Bob | 32 | MATH |
s3 | Charlie | 26 | CSC |
# selection of rows
df[df["AGE"] < 30]
SNAME | AGE | MAJOR | |
---|---|---|---|
s1 | Alice | 24 | CSC |
s3 | Charlie | 26 | CSC |
df["s1":"s1"]
SNAME | AGE | MAJOR | |
---|---|---|---|
s1 | Alice | 24 | CSC |
df.loc["s1":"s2"]
SNAME | AGE | MAJOR | |
---|---|---|---|
s1 | Alice | 24 | CSC |
s2 | Bob | 32 | MATH |
df.loc["s1":"s2",["SNAME","AGE"]]
SNAME | AGE | |
---|---|---|
s1 | Alice | 24 |
s2 | Bob | 32 |
df.iloc[1:2,[1,2]]
AGE | MAJOR | |
---|---|---|
s2 | 32 | MATH |
df.at["s2","AGE"]
32
df.columns = ["sname","age","major"]
print(df)
sname age major s1 Alice 24 CSC s2 Bob 32 MATH s3 Charlie 26 CSC
df.query('age < 30')
sname | age | major | |
---|---|---|---|
s1 | Alice | 24 | CSC |
s3 | Charlie | 26 | CSC |
df.query('age < 30 and sname.str.startswith("A")') # corresponds to WHERE clause in SQL
sname | age | major | |
---|---|---|---|
s1 | Alice | 24 | CSC |
# produce a x-y plot for computer science; x=semester, y=enrollment
import plotly.express as px
enroll = pd.read_csv("enrollment_by_department.csv")
print(enroll.shape)
print(enroll.columns)
print(enroll.index)
(70, 11) Index(['Department', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022', 'F2023'], dtype='object') RangeIndex(start=0, stop=70, step=1)
print(enroll)
Department F2014 F2015 F2016 F2017 F2018 \ 0 Accountancy (AC) 285 274 276 289 252 1 Accountancy (ACCT) 1099 1002 889 769 814 2 Africana Studies (AAS) 54 69 82 82 89 3 Anthropology (ANTH) 185 185 173 176 188 4 Applied Linguistics And Esl (ALSL) 226 257 243 236 237 .. ... ... ... ... ... ... 65 Social Work (SW) 410 400 357 338 350 66 Sociology (SOC) 620 652 697 658 619 67 Urban Studies Institute (URB) 0 0 0 0 0 68 Women'S Gender & Sexuality Stu (WGSS) 47 54 41 46 50 69 World Languages And Cultures (WLC) 0 53 110 142 156 F2019 F2020 F2021 F2022 F2023 0 218 236 216 172 106 1 781 744 677 657 546 2 78 81 80 80 52 3 182 181 166 160 124 4 243 224 199 163 121 .. ... ... ... ... ... 65 376 396 426 410 368 66 567 544 501 496 357 67 5 7 7 11 12 68 49 47 54 50 35 69 144 128 114 100 63 [70 rows x 11 columns]
csc = enroll[enroll["Department"] == "Computer Science (CSC)"].T[1:]
print(csc)
12 F2014 1237 F2015 1383 F2016 1581 F2017 1733 F2018 1972 F2019 2162 F2020 2340 F2021 2635 F2022 3236 F2023 2975
csc.reset_index(inplace=True)
print(csc)
index 12 0 F2014 1237 1 F2015 1383 2 F2016 1581 3 F2017 1733 4 F2018 1972 5 F2019 2162 6 F2020 2340 7 F2021 2635 8 F2022 3236 9 F2023 2975
csc.columns = ["semester","enrollment"]
print(csc)
semester enrollment 0 F2014 1237 1 F2015 1383 2 F2016 1581 3 F2017 1733 4 F2018 1972 5 F2019 2162 6 F2020 2340 7 F2021 2635 8 F2022 3236 9 F2023 2975
fig = px.line(csc,x="semester",y="enrollment",title="Computer Science Enrollments")
fig.show()
# Plot csc and chem in one line plot
csc = enroll[enroll["Department"] == "Computer Science (CSC)"].T[1:]
csc.reset_index(inplace=True)
csc["department"] = "CSC"
csc = csc.iloc[:,[2,0,1]]
csc.columns = ['department','semester','enrollment']
#print(csc)
chem = enroll[enroll["Department"] == "Chemistry (CHEM)"].T[1:]
chem.reset_index(inplace=True)
chem["department"] = "CHEM"
chem = chem.iloc[:,[2,0,1]]
chem.columns = ['department','semester','enrollment']
#print(chem)
csc_chem = pd.concat([csc,chem])
print(csc_chem)
fig = px.line(csc_chem, x="semester", y="enrollment", color="department", title='Enrollments: CSC v CHEM')
fig.show()
department semester enrollment 0 CSC F2014 1237 1 CSC F2015 1383 2 CSC F2016 1581 3 CSC F2017 1733 4 CSC F2018 1972 5 CSC F2019 2162 6 CSC F2020 2340 7 CSC F2021 2635 8 CSC F2022 3236 9 CSC F2023 2975 0 CHEM F2014 703 1 CHEM F2015 709 2 CHEM F2016 754 3 CHEM F2017 694 4 CHEM F2018 669 5 CHEM F2019 625 6 CHEM F2020 581 7 CHEM F2021 505 8 CHEM F2022 439 9 CHEM F2023 358
def plot_two_departments(df,dept1,dept2):
d1 = df[df["Department"] == dept1].T[1:]
d1.reset_index(inplace=True)
d1["department"] = dept1
d1 = d1.iloc[:,[2,0,1]]
d1.columns = ['department','semester','enrollment']
d2 = df[df["Department"] == dept2].T[1:]
d2.reset_index(inplace=True)
d2["department"] = dept2
d2 = d2.iloc[:,[2,0,1]]
d2.columns = ['department','semester','enrollment']
combined = pd.concat([d1,d2])
print(combined)
fig = px.line(combined, x="semester", y="enrollment", color="department",
title='Enrollments: '+dept1+' v ' + dept2)
fig.show()
plot_two_departments(enroll,"Computer Science (CSC)","Chemistry (CHEM)")
department semester enrollment 0 Computer Science (CSC) F2014 1237 1 Computer Science (CSC) F2015 1383 2 Computer Science (CSC) F2016 1581 3 Computer Science (CSC) F2017 1733 4 Computer Science (CSC) F2018 1972 5 Computer Science (CSC) F2019 2162 6 Computer Science (CSC) F2020 2340 7 Computer Science (CSC) F2021 2635 8 Computer Science (CSC) F2022 3236 9 Computer Science (CSC) F2023 2975 0 Chemistry (CHEM) F2014 703 1 Chemistry (CHEM) F2015 709 2 Chemistry (CHEM) F2016 754 3 Chemistry (CHEM) F2017 694 4 Chemistry (CHEM) F2018 669 5 Chemistry (CHEM) F2019 625 6 Chemistry (CHEM) F2020 581 7 Chemistry (CHEM) F2021 505 8 Chemistry (CHEM) F2022 439 9 Chemistry (CHEM) F2023 358
plot_two_departments(enroll,"Computer Science (CSC)","Biology (BIO)")
department semester enrollment 0 Computer Science (CSC) F2014 1237 1 Computer Science (CSC) F2015 1383 2 Computer Science (CSC) F2016 1581 3 Computer Science (CSC) F2017 1733 4 Computer Science (CSC) F2018 1972 5 Computer Science (CSC) F2019 2162 6 Computer Science (CSC) F2020 2340 7 Computer Science (CSC) F2021 2635 8 Computer Science (CSC) F2022 3236 9 Computer Science (CSC) F2023 2975 0 Biology (BIO) F2014 2611 1 Biology (BIO) F2015 2465 2 Biology (BIO) F2016 2447 3 Biology (BIO) F2017 2408 4 Biology (BIO) F2018 2537 5 Biology (BIO) F2019 2575 6 Biology (BIO) F2020 2611 7 Biology (BIO) F2021 2636 8 Biology (BIO) F2022 2366 9 Biology (BIO) F2023 2003