Pandas DataFrame¶

In [ ]:
import pandas as pd
In [ ]:
d = { "SNAME": ["Alice","Bob","Charlie"], "AGE": [24, 32, 26], "MAJOR": ["CSC","MATH","CSC"]}
df = pd.DataFrame(d)
In [ ]:
print(df)
print(df.shape)  # I ran this cell after renaming the index
     SNAME  AGE MAJOR
0    Alice   24   CSC
1      Bob   32  MATH
2  Charlie   26   CSC
(3, 3)
In [ ]:
print(df.columns)
print(df.index)
Index(['SNAME', 'AGE', 'MAJOR'], dtype='object')
RangeIndex(start=0, stop=3, step=1)
In [ ]:
df["SNAME"]  # project out columns
Out[ ]:
0      Alice
1        Bob
2    Charlie
Name: SNAME, dtype: object
In [ ]:
df[["SNAME","MAJOR"]]
Out[ ]:
SNAME MAJOR
0 Alice CSC
1 Bob MATH
2 Charlie CSC
In [ ]:
df[0:2]
Out[ ]:
SNAME AGE MAJOR
0 Alice 24 CSC
1 Bob 32 MATH
In [ ]:
df.index = ["s1","s2","s3"]
df
Out[ ]:
SNAME AGE MAJOR
s1 Alice 24 CSC
s2 Bob 32 MATH
s3 Charlie 26 CSC
In [ ]:
# selection of rows
df[df["AGE"] < 30]
Out[ ]:
SNAME AGE MAJOR
s1 Alice 24 CSC
s3 Charlie 26 CSC
In [ ]:
df["s1":"s1"]
Out[ ]:
SNAME AGE MAJOR
s1 Alice 24 CSC
In [ ]:
df.loc["s1":"s2"]
Out[ ]:
SNAME AGE MAJOR
s1 Alice 24 CSC
s2 Bob 32 MATH
In [ ]:
df.loc["s1":"s2",["SNAME","AGE"]]
Out[ ]:
SNAME AGE
s1 Alice 24
s2 Bob 32
In [ ]:
df.iloc[1:2,[1,2]]
Out[ ]:
AGE MAJOR
s2 32 MATH
In [ ]:
df.at["s2","AGE"]
Out[ ]:
32
In [ ]:
df.columns = ["sname","age","major"]
print(df)
      sname  age major
s1    Alice   24   CSC
s2      Bob   32  MATH
s3  Charlie   26   CSC
In [ ]:
df.query('age < 30')
Out[ ]:
sname age major
s1 Alice 24 CSC
s3 Charlie 26 CSC
In [ ]:
df.query('age < 30 and sname.str.startswith("A")')  # corresponds to WHERE clause in SQL 
Out[ ]:
sname age major
s1 Alice 24 CSC

PLOTLY¶

In [ ]:
# produce a x-y plot for computer science; x=semester, y=enrollment
import plotly.express as px

enroll = pd.read_csv("enrollment_by_department.csv")
print(enroll.shape)
print(enroll.columns)
print(enroll.index)
(70, 11)
Index(['Department', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019',
       'F2020', 'F2021', 'F2022', 'F2023'],
      dtype='object')
RangeIndex(start=0, stop=70, step=1)
In [ ]:
print(enroll)
                               Department  F2014  F2015  F2016  F2017  F2018  \
0                        Accountancy (AC)    285    274    276    289    252   
1                      Accountancy (ACCT)   1099   1002    889    769    814   
2                  Africana Studies (AAS)     54     69     82     82     89   
3                     Anthropology (ANTH)    185    185    173    176    188   
4      Applied Linguistics And Esl (ALSL)    226    257    243    236    237   
..                                    ...    ...    ...    ...    ...    ...   
65                       Social Work (SW)    410    400    357    338    350   
66                        Sociology (SOC)    620    652    697    658    619   
67          Urban Studies Institute (URB)      0      0      0      0      0   
68  Women'S Gender & Sexuality Stu (WGSS)     47     54     41     46     50   
69     World Languages And Cultures (WLC)      0     53    110    142    156   

    F2019  F2020  F2021  F2022  F2023  
0     218    236    216    172    106  
1     781    744    677    657    546  
2      78     81     80     80     52  
3     182    181    166    160    124  
4     243    224    199    163    121  
..    ...    ...    ...    ...    ...  
65    376    396    426    410    368  
66    567    544    501    496    357  
67      5      7      7     11     12  
68     49     47     54     50     35  
69    144    128    114    100     63  

[70 rows x 11 columns]
In [ ]:
csc = enroll[enroll["Department"] == "Computer Science (CSC)"].T[1:]
print(csc)
         12
F2014  1237
F2015  1383
F2016  1581
F2017  1733
F2018  1972
F2019  2162
F2020  2340
F2021  2635
F2022  3236
F2023  2975
In [ ]:
csc.reset_index(inplace=True)
print(csc)
   index    12
0  F2014  1237
1  F2015  1383
2  F2016  1581
3  F2017  1733
4  F2018  1972
5  F2019  2162
6  F2020  2340
7  F2021  2635
8  F2022  3236
9  F2023  2975
In [ ]:
csc.columns = ["semester","enrollment"]
print(csc)
  semester enrollment
0    F2014       1237
1    F2015       1383
2    F2016       1581
3    F2017       1733
4    F2018       1972
5    F2019       2162
6    F2020       2340
7    F2021       2635
8    F2022       3236
9    F2023       2975
In [ ]:
fig = px.line(csc,x="semester",y="enrollment",title="Computer Science Enrollments")
fig.show()
In [ ]:
# Plot csc and chem in one line plot
csc  = enroll[enroll["Department"] == "Computer Science (CSC)"].T[1:]
csc.reset_index(inplace=True)
csc["department"] = "CSC"
csc = csc.iloc[:,[2,0,1]]
csc.columns = ['department','semester','enrollment']
#print(csc)

chem = enroll[enroll["Department"] == "Chemistry (CHEM)"].T[1:]
chem.reset_index(inplace=True)
chem["department"] = "CHEM"
chem = chem.iloc[:,[2,0,1]]
chem.columns = ['department','semester','enrollment']
#print(chem)

csc_chem = pd.concat([csc,chem])

print(csc_chem)

fig = px.line(csc_chem, x="semester", y="enrollment", color="department", title='Enrollments: CSC v CHEM')
fig.show()
  department semester enrollment
0        CSC    F2014       1237
1        CSC    F2015       1383
2        CSC    F2016       1581
3        CSC    F2017       1733
4        CSC    F2018       1972
5        CSC    F2019       2162
6        CSC    F2020       2340
7        CSC    F2021       2635
8        CSC    F2022       3236
9        CSC    F2023       2975
0       CHEM    F2014        703
1       CHEM    F2015        709
2       CHEM    F2016        754
3       CHEM    F2017        694
4       CHEM    F2018        669
5       CHEM    F2019        625
6       CHEM    F2020        581
7       CHEM    F2021        505
8       CHEM    F2022        439
9       CHEM    F2023        358
In [ ]:
def plot_two_departments(df,dept1,dept2):
    d1  = df[df["Department"] == dept1].T[1:]
    d1.reset_index(inplace=True)
    d1["department"] = dept1
    d1 = d1.iloc[:,[2,0,1]]
    d1.columns = ['department','semester','enrollment']

    d2  = df[df["Department"] == dept2].T[1:]
    d2.reset_index(inplace=True)
    d2["department"] = dept2
    d2 = d2.iloc[:,[2,0,1]]
    d2.columns = ['department','semester','enrollment']

    combined = pd.concat([d1,d2])

    print(combined)

    fig = px.line(combined, x="semester", y="enrollment", color="department", 
                  title='Enrollments: '+dept1+' v ' + dept2)
    fig.show()
In [ ]:
plot_two_departments(enroll,"Computer Science (CSC)","Chemistry (CHEM)")
               department semester enrollment
0  Computer Science (CSC)    F2014       1237
1  Computer Science (CSC)    F2015       1383
2  Computer Science (CSC)    F2016       1581
3  Computer Science (CSC)    F2017       1733
4  Computer Science (CSC)    F2018       1972
5  Computer Science (CSC)    F2019       2162
6  Computer Science (CSC)    F2020       2340
7  Computer Science (CSC)    F2021       2635
8  Computer Science (CSC)    F2022       3236
9  Computer Science (CSC)    F2023       2975
0        Chemistry (CHEM)    F2014        703
1        Chemistry (CHEM)    F2015        709
2        Chemistry (CHEM)    F2016        754
3        Chemistry (CHEM)    F2017        694
4        Chemistry (CHEM)    F2018        669
5        Chemistry (CHEM)    F2019        625
6        Chemistry (CHEM)    F2020        581
7        Chemistry (CHEM)    F2021        505
8        Chemistry (CHEM)    F2022        439
9        Chemistry (CHEM)    F2023        358
In [ ]:
plot_two_departments(enroll,"Computer Science (CSC)","Biology (BIO)")
               department semester enrollment
0  Computer Science (CSC)    F2014       1237
1  Computer Science (CSC)    F2015       1383
2  Computer Science (CSC)    F2016       1581
3  Computer Science (CSC)    F2017       1733
4  Computer Science (CSC)    F2018       1972
5  Computer Science (CSC)    F2019       2162
6  Computer Science (CSC)    F2020       2340
7  Computer Science (CSC)    F2021       2635
8  Computer Science (CSC)    F2022       3236
9  Computer Science (CSC)    F2023       2975
0           Biology (BIO)    F2014       2611
1           Biology (BIO)    F2015       2465
2           Biology (BIO)    F2016       2447
3           Biology (BIO)    F2017       2408
4           Biology (BIO)    F2018       2537
5           Biology (BIO)    F2019       2575
6           Biology (BIO)    F2020       2611
7           Biology (BIO)    F2021       2636
8           Biology (BIO)    F2022       2366
9           Biology (BIO)    F2023       2003