Pandas는 고수준의 자료구조와 파이썬을 통한 빠르고 쉬운 데이터 분석 도구를 제공한다.
Series
- 일련의 객체를 담을 수 있는 1차원 벡터
- index(색인)라고 하는, 배열의 데이터에 연관된 이름을 가지고 있다.
import pandas as pd
import numpy as np
# --- Series basics -----------------------------------------------------
# A Series built from a plain list gets a default integer index.
obj = pd.Series([-4, 7, -4, 3])
for shown in (obj, obj.values, obj.index):
    print('\n', shown, '\n')

# The same values, this time with explicit labels as the index.
labels = ['d', 'b', 'a', 'c']
obj2 = pd.Series([-4, 7, -4, 3], index=labels)
for shown in (obj2, obj2.index, obj2['a']):
    print('\n', shown, '\n')

# Assigning through a label updates the Series in place.
obj2['d'] = 6
print('\n', obj2, '\n')

# A dict becomes a Series whose index is the dict's keys.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
print('\n', obj3, '\n')

# Boolean filtering, scalar arithmetic and NumPy ufuncs all keep the labels.
for shown in (obj2[obj2 > 0], obj2 * 2, np.exp(obj2)):
    print('\n', shown, '\n')

# Passing an index alongside a dict selects/orders by those labels; the first
# label below is not a key of sdata, so it comes out as NaN.
states = ['Califonia', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
for shown in (
    obj4,
    pd.isnull(obj4),
    pd.notnull(obj4),
    obj4.isnull(),
    obj3 + obj4,  # arithmetic aligns on index labels
):
    print('\n', shown, '\n')
print("-----------------------------------------------------------------------------")
# --- DataFrame construction ----------------------------------------------
# A dict of equal-length lists becomes a DataFrame, one column per key.
data = {
    'states': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
print('\n', frame, '\n')

# `columns` fixes the column order.
reordered = pd.DataFrame(data, columns=['year', 'states', 'pop'])
print('\n', reordered, '\n')

# 'debt' is not a key of `data`, so that column is created holding NaN.
row_labels = ['one', 'tow', 'three', 'four', 'five', 'six']
frame2 = pd.DataFrame(
    data,
    columns=['year', 'states', 'pop', 'debt'],
    index=row_labels,
)
# Whole frame, a column by key, a column by name, and a row by label.
for shown in (frame2, frame2['states'], frame2['year'], frame2.loc['three']):
    print('\n', shown, '\n')

# A column can be overwritten with a scalar (broadcast to every row) or with
# an array whose length matches the frame.
for debt in (17.5, np.arange(6.)):
    frame2['debt'] = debt
    print('\n', frame2, '\n')
print("-----------------------------------------------------------------------------")
# --- Reindexing ------------------------------------------------------------
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print('\n', obj)

# reindex returns a new Series arranged by the requested labels; 'e' does not
# exist in obj, so it is NaN unless a fill_value is supplied.
wanted = ['a', 'b', 'c', 'd', 'e']
obj2 = obj.reindex(wanted)
print('\n', obj2)
obj2 = obj.reindex(wanted, fill_value=0)
print('\n', obj2)

# For ordered data the gaps can be filled from the previous value ('ffill')
# or from the next one ('backfill').
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
print('\n', obj3)
for fill_method in ('ffill', 'backfill'):
    print('\n', obj3.reindex(range(6), method=fill_method))

# DataFrame.reindex works on rows by default and on columns via `columns=`.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['Ohio', 'Texas', 'California'],
)
print('\n', frame)

frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print('\n', frame2)

states = ['Texas', 'Utah', 'California']
print('\n', frame.reindex(columns=states))
print("-----------------------------------------------------------------------------")
# --- Dropping entries ------------------------------------------------------
obj = pd.Series(np.arange(5.), index=list('abcde'))
print('\n', obj)

# drop returns a new object without the given label(s); obj is left intact.
new_obj = obj.drop('c')
print('\n', new_obj)
print('\n', obj.drop(['d', 'c']))

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print('\n', data)

# Without an axis argument the labels are looked up among the rows ...
without_rows = data.drop(['Colorado', 'Ohio'])
print('\n', without_rows)

# ... while axis=1 / axis='columns' removes columns instead.
for col_labels, axis in (('two', 1), (['two', 'four'], 'columns')):
    print('\n', data.drop(col_labels, axis=axis))
print("-----------------------------------------------------------------------------")
# --- Series indexing -------------------------------------------------------
# The Series below has string labels. Plain `obj[...]` is used only for
# label-based and boolean selection; integer *positions* go through `.iloc`.
# Using `obj[1]` / `obj[[1, 3]]` for positions on a non-integer index relies
# on a positional fallback that pandas has deprecated, so it is spelled out
# explicitly here. The printed results are unchanged.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print('\n', obj)

print('\n', obj['b'])                # single label
print('\n', obj.iloc[1])             # single position
print('\n', obj.iloc[2:4])           # positional slice, end excluded
print('\n', obj[['b', 'a', 'c']])    # list of labels
print('\n', obj.iloc[[1, 3]])        # list of positions
print('\n', obj[obj < 2])            # boolean mask

# Label slices include the end label, unlike positional slices.
print('\n', obj['b':'c'])
obj['b':'c'] = 5
print('\n', obj)
print("-----------------------------------------------------------------------------")
# --- DataFrame indexing ----------------------------------------------------
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print('\n', data)

# [] with a name or list of names selects columns; with a slice or a boolean
# Series it selects rows.
print('\n', data['two'])
print('\n', data[['two', 'three']])
print('\n', data[:2])
print('\n', data[data['three'] > 5])

# A boolean DataFrame used as the key assigns element-wise.
data[data < 5] = 0
print('\n', data)

# loc selects by label, iloc by integer position.
print('\n', data.loc['Colorado', ['two', 'three']])
print('\n', data.iloc[2, [3, 0, 1]])
print('\n', data.iloc[[1, 2], [3, 0, 1]])
# Note: iloc takes integer positions only, so a column label such as
# data.iloc[[1, 2], ['three']] is not a valid selection and is not run here.

print('\n', data[['one', 'two']])
over_five = data['three'] > 5
print('\n', over_five)

# Keep only the 'one' and 'two' columns of the rows whose 'three' value is
# greater than 5.
print('\n', data.loc[over_five, ['one', 'two']])
0 -4
1 7
2 -4
3 3
dtype: int64
[-4 7 -4 3]
RangeIndex(start=0, stop=4, step=1)
d -4
b 7
a -4
c 3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
-4
d 6
b 7
a -4
c 3
dtype: int64
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
d 6
b 7
c 3
dtype: int64
d 12
b 14
a -8
c 6
dtype: int64
d 403.428793
b 1096.633158
a 0.018316
c 20.085537
dtype: float64
Califonia NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
Califonia True
Ohio False
Oregon False
Texas False
dtype: bool
Califonia False
Ohio True
Oregon True
Texas True
dtype: bool
Califonia True
Ohio False
Oregon False
Texas False
dtype: bool
Califonia NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
-----------------------------------------------------------------------------
states year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
year states pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
year states pop debt
one 2000 Ohio 1.5 NaN
tow 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
one Ohio
tow Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: states, dtype: object
one 2000
tow 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
year 2002
states Ohio
pop 3.6
debt NaN
Name: three, dtype: object
year states pop debt
one 2000 Ohio 1.5 17.5
tow 2001 Ohio 1.7 17.5
three 2002 Ohio 3.6 17.5
four 2001 Nevada 2.4 17.5
five 2002 Nevada 2.9 17.5
six 2003 Nevada 3.2 17.5
year states pop debt
one 2000 Ohio 1.5 0.0
tow 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
six 2003 Nevada 3.2 5.0
-----------------------------------------------------------------------------
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
0 blue
2 purple
4 yellow
dtype: object
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object
Ohio Texas California
a 0 1 2
b 3 4 5
c 6 7 8
Ohio Texas California
a 0.0 1.0 2.0
b 3.0 4.0 5.0
c 6.0 7.0 8.0
d NaN NaN NaN
Texas Utah California
a 1 NaN 2
b 4 NaN 5
c 7 NaN 8
-----------------------------------------------------------------------------
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
a 0.0
b 1.0
e 4.0
dtype: float64
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two three four
Utah 8 9 10 11
New York 12 13 14 15
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
-----------------------------------------------------------------------------
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
1.0
1.0
c 2.0
d 3.0
dtype: float64
b 1.0
a 0.0
c 2.0
dtype: float64
b 1.0
d 3.0
dtype: float64
a 0.0
b 1.0
dtype: float64
b 1.0
c 2.0
dtype: float64
a 0.0
b 5.0
c 5.0
d 3.0
dtype: float64
-----------------------------------------------------------------------------
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int32
two three
Ohio 1 2
Colorado 5 6
Utah 9 10
New York 13 14
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
two 5
three 6
Name: Colorado, dtype: int32
four 11
one 8
two 9
Name: Utah, dtype: int32
four one two
Colorado 7 0 5
Utah 11 8 9
one two
Ohio 0 0
Colorado 0 5
Utah 8 9
New York 12 13
Ohio False
Colorado True
Utah True
New York True
Name: three, dtype: bool
one two
Colorado 0 5
Utah 8 9
New York 12 13
Process finished with exit code 0
Lambda
import pandas as pd
import numpy as np
# --- Applying functions to a DataFrame -------------------------------------
# Fixed seed so the random frame is the same on every run.
np.random.seed(12345)
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
# NumPy ufuncs operate element-wise on a DataFrame.
print('\n', np.abs(frame))
print('\n', frame)


def f(x):
    """Return the spread (max minus min) of the Series ``x``."""
    return x.max() - x.min()


# apply hands each column to f by default, and each row when axis=1.
print('\n', frame.apply(f))
print('\n', frame.apply(f, axis=1))


def f1(x):
    """Return a two-element Series holding the min and max of ``x``.

    Because the result is a Series, ``DataFrame.apply(f1)`` produces a frame
    with one 'min' row and one 'max' row.
    """
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


print('\n', frame.apply(f1))


def two_decimals(x):
    """Format the number ``x`` as a string with two decimal places."""
    return '%.2f' % x


# Element-wise formatting. Mapping column by column with Series.map gives the
# same result as DataFrame.applymap, without depending on that deprecated
# method (or on its newer replacement, DataFrame.map).
print('\n', frame.apply(lambda column: column.map(two_decimals)))
print('\n', frame['e'].map(two_decimals))
rule = "-----------------------------------------------------------------------------"
print(rule)
# --- Summary statistics with missing values --------------------------------
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
print('\n', df)

# In order: column sums, row sums, row means with skipna=False (so any NaN in
# a row makes that row's mean NaN), and the describe() summary table.
summaries = (
    df.sum(),
    df.sum(axis=1),
    df.mean(axis=1, skipna=False),
    df.describe(),
)
for summary in summaries:
    print(rule)
    print('\n', summary)
b d e
Utah 0.204708 0.478943 0.519439
Ohio 0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 1.296221
b d e
Utah -0.204708 0.478943 -0.519439
Ohio -0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 -1.296221
b 1.802165
d 1.684034
e 2.689627
dtype: float64
Utah 0.998382
Ohio 2.521511
Texas 0.676115
Oregon 2.542656
dtype: float64
b d e
min -0.555730 0.281746 -1.296221
max 1.246435 1.965781 1.393406
b d e
Utah -0.20 0.48 -0.52
Ohio -0.56 1.97 1.39
Texas 0.09 0.28 0.77
Oregon 1.25 1.01 -1.30
Utah -0.52
Ohio 1.39
Texas 0.77
Oregon -1.30
Name: e, dtype: object
-----------------------------------------------------------------------------
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
-----------------------------------------------------------------------------
one 9.25
two -5.80
dtype: float64
-----------------------------------------------------------------------------
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
-----------------------------------------------------------------------------
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
-----------------------------------------------------------------------------
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
Process finished with exit code 0
'Data > Python' 카테고리의 다른 글
Machine Learning #6 Pandas Dataframe (0) | 2018.09.20 |
---|---|
Machine Learning #5 Matplotlib (0) | 2018.09.20 |
Machine Learning #3 numpy (0) | 2018.09.19 |
Machine Learning #2 python (0) | 2018.09.18 |
Machine Learning #1 python (0) | 2018.09.18 |