1. Pandas Dataframe 형식으로 읽어오기
import pandas as pd
df = pd.read_csv('./data_file/ex1.csv')
print('\n', df)
print('\n', pd.read_table('./data_file/ex1.csv', sep=','))
print('\n', pd.read_table('./data_file/ex2.csv', header=None, sep=','))
print('\n', pd.read_table('./data_file/ex2.csv', names=['a','b','c','d','message']))
names = ['a','b','c','d','message']
print('\n', pd.read_table('./data_file/ex2.csv', names=names, index_col='message'))
parsed = pd.read_csv('./data_file/csv_mindex.csv', index_col=['key1', 'key2'])
print('\n', parsed)
df2 = pd.read_csv('./data_file/ex4.csv', skiprows=[0,2,3])
print('\n', df2)
parameter = {'message':['foo', 'NA']}
df2 = pd.read_csv('./data_file/ex4.csv', skiprows=[0,2,3], na_values=parameter)
print('\n', df2)
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
a b c d message
0 1,2,3,4,hello NaN NaN NaN NaN
1 5,6,7,8,world NaN NaN NaN NaN
2 9,10,11,12,foo NaN NaN NaN NaN
a b c d
message
NaN 1,2,3,4,hello NaN NaN NaN
NaN 5,6,7,8,world NaN NaN NaN
NaN 9,10,11,12,foo NaN NaN NaN
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 NaN
2. JSON Read
import json
obj = """
{ "name":"Wes",
"places_lived":["United States", "Spain", "Germany"],
"pet":null,
"siblings":[{"name":"Scott", "age":"25", "pet":"Zuco"},
{"name":"Katie", "age":"33", "pet":"Cisco"}]
}
"""
result = json.loads(obj)
print('\n', result)
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
print('\n', siblings)
{'places_lived': ['United States', 'Spain', 'Germany'], 'name': 'Wes', 'siblings': [{'pet': 'Zuco', 'name': 'Scott', 'age': '25'}, {'pet': 'Cisco', 'name': 'Katie', 'age': '33'}], 'pet': None}
name age
0 Scott 25
1 Katie 33
Process finished with exit code 0
3. Merge
import pandas as pd
df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],
'data1':range(7)})
df2 = pd.DataFrame({'key':['a','b','d'],
'data2':range(3)})
print('\n', df1)
print('\n', df2)
#내부 join 형태로 구성
print('\n', pd.merge(df1, df2, on='key'))
df3 = pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],
'data1':range(7)})
df4 = pd.DataFrame({'rkey':['a','b','d'],
'data2':range(3)})
print('\n', pd.merge(df3, df4, left_on='lkey', right_on='rkey'))
print('-----------------------------------------------------------------------')
#외부 join 형태로 구성
print('\n', pd.merge(df1, df2, how='left'))
print('\n', pd.merge(df1, df2, how='right'))
print('\n', pd.merge(df1, df2, how='outer'))
print('-----------------------------------------------------------------------')
left = pd.DataFrame({'key1':['foo','foo','bar'],
'key2':['one','two','one'],
'lval':[1,2,3]})
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
'key2':['one','one','one','two'],
'rval':[4,5,6,7]})
print('\n', left)
print('\n', right)
print('\n', pd.merge(left, right, on=['key1','key2'], how='outer'))
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
data2 key
0 0 a
1 1 b
2 2 d
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
-----------------------------------------------------------------------
data1 key data2
0 0 b 1.0
1 1 b 1.0
2 2 a 0.0
3 3 c NaN
4 4 a 0.0
5 5 a 0.0
6 6 b 1.0
data1 key data2
0 0.0 b 1
1 1.0 b 1
2 6.0 b 1
3 2.0 a 0
4 4.0 a 0
5 5.0 a 0
6 NaN d 2
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
-----------------------------------------------------------------------
key1 key2 lval
0 foo one 1
1 foo two 2
2 bar one 3
key1 key2 rval
0 foo one 4
1 foo one 5
2 bar one 6
3 bar two 7
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
4. 재형성과 피벗
표현식의 데이터를 재배치하는 연산을 재형성 또는 피벗 연산이라고 한다.
DataFame 데이터 재배치
- stack: 데이터의 칼럼을 로우로 피벗 또는 회전 시킨다.
- unstack: 로우를 칼럼으로 피벗시킨다.
data = pd.DataFrame(np.arange(6).reshape((2,3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'], name='number'))
print('\n', data)
result = data.stack()
print('\n', result)
print('\n', result.unstack())
print('\n', result.unstack(0))
print('\n', result.unstack(1))
print('\n', result.unstack('state'))
print('-----------------------------------------------------------------------')
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
-----------------------------------------------------------------------
-----------------------------------------------------------------------
5.
data = pd.read_csv('./data_file/macrodata.csv')
#print('\n', data)
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = pd.DataFrame(data.to_records(),
columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
index=periods.to_timestamp('D','end'))
ldata = data.stack().reset_index().rename(columns={0:'value'})
print('\n', data.head(10))
print('\n', ldata[:10])
ldata['value2'] = np.random.randn(len(ldata))
print('\n', ldata[:10])
print('-----------------------------------------------------------------------')
item realgdp infl unemp
date
1959-03-31 2710.349 0.00 5.8
1959-06-30 2778.801 2.34 5.1
1959-09-30 2775.488 2.74 5.3
1959-12-31 2785.204 0.27 5.6
1960-03-31 2847.699 2.31 5.2
1960-06-30 2834.390 0.14 5.2
1960-09-30 2839.022 2.70 5.6
1960-12-31 2802.616 1.21 6.3
1961-03-31 2819.264 -0.40 6.8
1961-06-30 2872.005 1.47 7.0
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2778.801
4 1959-06-30 infl 2.340
5 1959-06-30 unemp 5.100
6 1959-09-30 realgdp 2775.488
7 1959-09-30 infl 2.740
8 1959-09-30 unemp 5.300
9 1959-12-31 realgdp 2785.204
date item value value2
0 1959-03-31 realgdp 2710.349 -0.161222
1 1959-03-31 infl 0.000 1.683019
2 1959-03-31 unemp 5.800 -1.227292
3 1959-06-30 realgdp 2778.801 1.668738
4 1959-06-30 infl 2.340 -0.461753
5 1959-06-30 unemp 5.100 -1.438030
6 1959-09-30 realgdp 2775.488 -1.634590
7 1959-09-30 infl 2.740 0.058412
8 1959-09-30 unemp 5.300 2.300423
9 1959-12-31 realgdp 2785.204 2.059386
-----------------------------------------------------------------------
6.중복제거
data = pd.DataFrame({'k1':['one']*3 + ['two']*4,
'k2':[1,1,2,3,3,4,4]})
print('\n', data)
print('\n', data.duplicated())
print('\n', data.drop_duplicates())
data['v1'] = range(7)
print('\n', data)
print('\n', data.duplicated(['k1']))
print('\n', data.drop_duplicates(['k1', 'k2']))
print('-----------------------------------------------------------------------')
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6
0 False
1 True
2 True
3 False
4 True
5 True
6 True
dtype: bool
k1 k2 v1
0 one 1 0
2 one 2 2
3 two 3 3
5 two 4 5
-----------------------------------------------------------------------
7. 치환 & 특이값 찾아 제외
data = pd.DataFrame({'food':['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
'Bacon', 'pastrami', 'honey ham', 'nova lox'],
'ounces' :[4,3,12,6,7.5,8,3,5,6]})
print('\n', data)
print('-----------------------------------------------------------------------')
meat_to_animal = {
'bacon' : 'pig',
'pulled pork' : 'pig',
'pastrami' : 'cow',
'corned beef' : 'cow',
'honey ham' : 'pig',
'nova lox' : 'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print('\n', data)
print('\n', data['food'].map(lambda x:meat_to_animal[x.lower()]))
print('-----------------------------------------------------------------------')
#치환
data = pd.Series([1., -999., 2., -999, -1000, 3.])
print('\n', data)
print('\n', data.replace([-999,-1000], np.nan))
print('\n', data.replace({-999:np.nan, -1000:0}))
print('-----------------------------------------------------------------------')
#특이값 찾아 제외
np.random.seed(12345)
data = pd.DataFrame(np.random.randn(1000,4))
print('\n', data.describe())
print('-----------------------------------------------------------------------')
col = data[3]
print('\n', col)
print('\n', col[np.abs(col) > 3])
C:\Users\user\AppData\Local\conda\conda\envs\datastudy\python.exe C:/Users/user/Desktop/study/data_merge.py
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
-----------------------------------------------------------------------
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
-----------------------------------------------------------------------
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
-----------------------------------------------------------------------
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
-----------------------------------------------------------------------
0 -0.555730
1 0.281746
2 -1.296221
3 0.886429
4 -0.438570
5 -1.021228
6 0.523772
7 -0.831154
8 0.560145
9 0.332883
10 -0.970736
11 -0.753887
12 0.246674
13 -0.919262
14 -0.660524
15 0.670216
16 -2.304234
17 1.074623
18 -0.503087
19 0.222896
20 0.433610
21 0.850591
22 2.169461
23 0.795253
24 0.152677
25 -0.929006
26 0.980928
27 0.457002
28 -0.402827
29 -1.648985
...
970 -1.460094
971 0.132262
972 -1.091849
973 -1.243078
974 -0.521155
975 -0.307670
976 1.381155
977 0.075798
978 0.102469
979 1.120895
980 -0.285176
981 0.053410
982 0.233619
983 0.604880
984 -0.335099
985 1.367213
986 -0.512542
987 -1.050942
988 -0.387002
989 1.642339
990 1.336648
991 0.153988
992 -1.466596
993 1.101972
994 0.604072
995 1.653126
996 -1.502870
997 0.133304
998 -0.303340
999 0.882755
Name: 3, Length: 1000, dtype: float64
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64
'Data > Python' 카테고리의 다른 글
'머신러닝-딥러닝', 뭐가 다를까? (0) | 2018.10.22 |
---|---|
[Anaconda] 아나콘다 설치 및 사용법 for 맥 & 윈도우 (0) | 2018.09.27 |
Machine Learning #5 Matplotlib (0) | 2018.09.20 |
Machine Learning #4 Pandas (0) | 2018.09.19 |
Machine Learning #3 numpy (0) | 2018.09.19 |