import pandas as pd
import numpy as np

# Load the Olympic medal table: skip the first two lines of the file and
# use the first column (the country label) as the row index.
df = pd.read_csv('olympics.csv', index_col=0, skiprows=2)

# Peek at the first three rows.
df.head(3)

# Peek at the last three rows; the final row is an aggregate 'Totals' row.
df.tail(3)

# Remove the aggregate 'Totals' row by its index label. drop() returns a new
# DataFrame, hence the reassignment.
df = df.drop('Totals')
df.tail(3)

# (number of rows, number of columns) of the cleaned frame.
df.shape

(146, 15)

# Total number of cells in the frame (rows * columns).
df.size

2190

# Column labels of the frame.
df.columns

Index(['Summer games', 'Summer gold', 'Summer silver', 'Summer bronze',
       'Summer total', 'Winter games', 'Winter gold', 'Winter silver',
       'Winter bronze', 'Winter total', 'Combined games', 'Combined gold',
       'Combined silver', 'Combined bronze', 'Combined total'],
      dtype='object')

# Row labels: country name followed by its code, e.g. 'Afghanistan (AFG)'.
df.index

Index(['Afghanistan (AFG)', 'Algeria (ALG)', 'Argentina (ARG)',
       'Armenia (ARM)', 'Australasia (ANZ) [ANZ]', 'Australia (AUS) [AUS] [Z]',
       'Austria (AUT)', 'Azerbaijan (AZE)', 'Bahamas (BAH)', 'Bahrain (BRN)',
       ...
       'Uruguay (URU)', 'Uzbekistan (UZB)', 'Venezuela (VEN)', 'Vietnam (VIE)',
       'Virgin Islands (ISV)', 'Yugoslavia (YUG) [YUG]',
       'Independent Olympic Participants (IOP) [IOP]', 'Zambia (ZAM) [ZAM]',
       'Zimbabwe (ZIM) [ZIM]', 'Mixed team (ZZX) [ZZX]'],
      dtype='object', length=146)

# Strip the trailing "(CODE) [...]" part from each row label so the index holds
# only the country name, e.g. 'Afghanistan (AFG)' -> 'Afghanistan'.
# The pattern is a raw string: in a normal string literal '\s' and '\(' are
# invalid escape sequences, which newer Python versions warn about.
names_ids = df.index.str.split(r'\s\(')
df.index = names_ids.str[0]
df.head(3)

# Return a Series selecting row 2 (iloc and single brackets)
df.iloc[2]

Summer games       23
Summer gold        18
Summer silver      24
Summer bronze      28
Summer total       70
Winter games       18
Winter gold         0
Winter silver       0
Winter bronze       0
Winter total        0
Combined games     41
Combined gold      18
Combined silver    24
Combined bronze    28
Combined total     70
Name: Argentina, dtype: int64

# Return a DataFrame selecting rows 2-4 (iloc and a single-bracket slice;
# the stop position 5 is excluded)
df.iloc[2:5]

# Return a single-row DataFrame selecting row 2 (iloc and double brackets)
df.iloc[[2]]

# Return a Series for row labeled France (loc and single brackets)
df.loc['France']

Summer games        27
Summer gold        202
Summer silver      223
Summer bronze      246
Summer total       671
Winter games        22
Winter gold         31
Winter silver       31
Winter bronze       47
Winter total       109
Combined games      49
Combined gold      233
Combined silver    254
Combined bronze    293
Combined total     780
Name: France, dtype: int64

# Return a DataFrame for rows labeled France and Germany (loc, single brackets, list argument)
countries = ['France', 'Germany']
df.loc[countries]

# Return a single-row DataFrame for row labeled France (loc and double brackets)
df.loc[['France']]

# Return column as a Series (column name and single brackets);
# head() limits the display to the first five rows
df['Summer silver'].head()

Afghanistan     0
Algeria         2
Argentina      24
Armenia         2
Australasia     4
Name: Summer silver, dtype: int64

# Return column as a DataFrame (column name and double brackets)
df[['Summer silver']].head()

# Return multiple columns as a DataFrame (single brackets around a list of column names)
cnames = ['Summer gold', 'Summer silver', 'Summer bronze']
df[cnames].head()

# Add a derived column: weighted medal score (gold = 3, silver = 2, bronze = 1)
df['Combined weighted'] = df['Combined gold']*3 + df['Combined silver']*2 + df['Combined bronze']
df.head()

# Remove the derived column again, in place.
# A non-mutating alternative is "df = df.drop(columns='Combined weighted')":
# drop() returns a new DataFrame rather than modifying df, and the older
# positional form "df.drop('Combined weighted', 1)" is rejected by pandas 2.x.

del df['Combined weighted']
df.head()

# Largest Winter gold medal count found in any row
df['Winter gold'].max()

118

# Index label of the row holding the largest Winter gold medal count
df['Winter gold'].idxmax()

'Norway'

# Boolean-mask row selection: rows with more than 50 Winter gold medals
df.loc[ df['Winter gold']>50 ]

# Combine two masks with &; each comparison needs its own parentheses
# because & binds more tightly than >
df2 = df.loc[ (df['Winter gold']>50) & (df['Summer gold']>50) ]
df2

# Keep only the two gold-medal columns
df2 = df2[['Summer gold', 'Winter gold']]
df2

# Total Winter gold medals across the selected rows
df2['Winter gold'].sum()

432

# Select three columns, then rows at positions 10-14 with iloc
df[["Winter gold", "Winter silver", "Winter bronze"]].iloc[10:15]

# Same positional row slice using plain [] slicing on the DataFrame
df[["Winter gold", "Winter silver", "Winter bronze"]][10:15]

# Select three columns, then rows by index label with loc
df[["Winter gold", "Winter silver", "Winter bronze"]].loc[['Barbados', 'Belarus', 'Belgium']]

# Missing values are displayed as NaNs (Not a Number)
df3 = pd.read_csv('city temps.csv', index_col=0, skiprows=0)
df3

# The simplest method is to replace missing values with a fixed value
df3.fillna(value=70)

# Forward filling uses the last valid value to fill a missing value.
# Note that in this case we use axis=1 so that values propagate along
# each row (across the columns) rather than down each column.
# ffill() is used because fillna(method="ffill") is deprecated since pandas 2.1.

df3.ffill(axis=1)

# Back filling uses the NEXT valid value to fill a missing value.
# Note that in this case we use axis=1 so that values propagate along
# each row (across the columns) rather than down each column.
# bfill() is used because fillna(method="bfill") is deprecated since pandas 2.1.

df3.bfill(axis=1)

# As of version 0.17.0, pandas provides an interpolate function that fills missing values
# By default, we get a linear interpolation, but a number of other options are available
# (quadratic, cubic, polynomial, etc.)

df3.interpolate(method="linear", axis=1)

# Read one worksheet of the Excel workbook, chosen by sheet_name; the first
# column becomes the row index and no leading rows are skipped.
df4 = pd.read_excel('city temps spreadsheet.xlsx', index_col=0, skiprows=0, sheet_name='set1')
df4

# Same workbook, second worksheet.
df5 = pd.read_excel('city temps spreadsheet.xlsx', index_col=0, skiprows=0, sheet_name='set2')
df5

# 10 x 3 array of uniform random samples drawn from the half-open interval [0, 1)
a = np.random.random_sample((10, 3))
a

array([[0.93767545, 0.52905276, 0.37840219],
       [0.69301731, 0.65304621, 0.3063061 ],
       [0.69479708, 0.92496885, 0.22242732],
       [0.03814035, 0.27154036, 0.59482261],
       [0.41736065, 0.15261061, 0.7636656 ],
       [0.01100607, 0.64267618, 0.28872264],
       [0.61810238, 0.30143727, 0.09309254],
       [0.64356449, 0.97165521, 0.11659938],
       [0.151108  , 0.658962  , 0.05964654],
       [0.90910788, 0.85471942, 0.51000376]])

# Wrap the NumPy array in a DataFrame, naming its three columns; the row
# index defaults to consecutive integers starting at 0.
df6 = pd.DataFrame(a, columns=['feature 1', 'feature 2', 'feature 3'])
df6

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
New York	71.0	NaN	75.0	78.0	80.0	81.0	79.0
Boston	58.0	56.0	NaN	54.0	50.0	61.0	63.0
Dallas	92.0	91.0	90.0	NaN	NaN	85.0	82.0
San Diego	72.0	72.0	72.0	70.0	NaN	71.0	68.0
Seattle	61.0	63.0	61.0	NaN	60.0	61.0	68.0

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
New York	71.0	70.0	75.0	78.0	80.0	81.0	79.0
Boston	58.0	56.0	70.0	54.0	50.0	61.0	63.0
Dallas	92.0	91.0	90.0	70.0	70.0	85.0	82.0
San Diego	72.0	72.0	72.0	70.0	70.0	71.0	68.0
Seattle	61.0	63.0	61.0	70.0	60.0	61.0	68.0

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
New York	71.0	71.0	75.0	78.0	80.0	81.0	79.0
Boston	58.0	56.0	56.0	54.0	50.0	61.0	63.0
Dallas	92.0	91.0	90.0	90.0	90.0	85.0	82.0
San Diego	72.0	72.0	72.0	70.0	70.0	71.0	68.0
Seattle	61.0	63.0	61.0	61.0	60.0	61.0	68.0

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
New York	71.0	75.0	75.0	78.0	80.0	81.0	79.0
Boston	58.0	56.0	54.0	54.0	50.0	61.0	63.0
Dallas	92.0	91.0	90.0	85.0	85.0	85.0	82.0
San Diego	72.0	72.0	72.0	70.0	71.0	71.0	68.0
Seattle	61.0	63.0	61.0	60.0	60.0	61.0	68.0

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
New York	71.0	73.0	75.0	78.000000	80.000000	81.0	79.0
Boston	58.0	56.0	55.0	54.000000	50.000000	61.0	63.0
Dallas	92.0	91.0	90.0	88.333333	86.666667	85.0	82.0
San Diego	72.0	72.0	72.0	70.000000	70.500000	71.0	68.0
Seattle	61.0	63.0	61.0	60.500000	60.000000	61.0	68.0

Source Information¶

Goal¶

Pandas - a quick introduction¶

Required Modules for the Jupyter Notebook¶

Load CSV Data Set¶

Single and double square brackets / accessing rows and columns¶

Selecting rows¶

Selecting columns¶

Adding and deleting columns¶

Putting it all together¶

Missing values¶

Reading from other file formats¶

From numpy array to data frame¶

Submit Ticket¶

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Afghanistan (AFG)	13	0	0	2	2	0	13	0	0	2	2
Algeria (ALG)	12	5	2	8	15	3	15	5	2	8	15
Argentina (ARG)	23	18	24	28	70	18	41	18	24	28	70

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Winter gold	Winter silver	Winter bronze	Winter total	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Zimbabwe (ZIM) [ZIM]	12	3	4	1	8	1	0	0	0	0	13	3	4	1	8
Mixed team (ZZX) [ZZX]	3	8	5	4	17	0	0	0	0	0	3	8	5	4	17
Totals	27	4809	4775	5130	14714	22	959	958	948	2865	49	5768	5733	6078	17579

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Zambia (ZAM) [ZAM]	12	0	1	1	2	0	12	0	1	1	2
Zimbabwe (ZIM) [ZIM]	12	3	4	1	8	1	13	3	4	1	8
Mixed team (ZZX) [ZZX]	3	8	5	4	17	0	3	8	5	4	17

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Afghanistan	13	0	0	2	2	0	13	0	0	2	2
Algeria	12	5	2	8	15	3	15	5	2	8	15
Argentina	23	18	24	28	70	18	41	18	24	28	70

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Argentina	23	18	24	28	70	18	41	18	24	28	70
Armenia	5	1	2	9	12	6	11	1	2	9	12
Australasia	2	3	4	5	12	0	2	3	4	5	12

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Winter gold	Winter silver	Winter bronze	Winter total	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
France	27	202	223	246	671	22	31	31	47	109	49	233	254	293	780
Germany	15	174	182	217	573	11	78	78	53	209	26	252	260	270	782

	Summer games	Summer gold	Summer silver	Summer bronze	Summer total	Winter games	Winter gold	Winter silver	Winter bronze	Winter total	Combined games	Combined gold	Combined silver	Combined bronze	Combined total
Austria	26	18	33	35	86	22	59	78	81	218	48	77	111	116	304
Canada	25	59	99	121	279	22	62	56	52	170	47	121	155	173	449
Germany	15	174	182	217	573	11	78	78	53	209	26	252	260	270	782
Norway	24	56	49	43	148	22	118	111	100	329	46	174	160	143	477
Soviet Union	9	395	319	296	1010	9	78	57	59	194	18	473	376	355	1204
United States	26	976	757	666	2399	22	96	102	84	282	48	1072	859	750	2681

	Sunday	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday
Atlanta	71	NaN	75.0	78.0	80.0	81	79
Portland	58	56.0	NaN	54.0	50.0	61	63
Phoenix	92	91.0	90.0	NaN	NaN	85	82
Las Vegas	72	72.0	72.0	70.0	NaN	71	68
Chicago	61	63.0	61.0	NaN	60.0	61	68

	feature 1	feature 2	feature 3
0	0.937675	0.529053	0.378402
1	0.693017	0.653046	0.306306
2	0.694797	0.924969	0.222427
3	0.038140	0.271540	0.594823
4	0.417361	0.152611	0.763666
5	0.011006	0.642676	0.288723
6	0.618102	0.301437	0.093093
7	0.643564	0.971655	0.116599
8	0.151108	0.658962	0.059647
9	0.909108	0.854719	0.510004