[Machine Learning] Mnist, SVM

1 minute read

MNIST (숫자 손글씨) 분석 예제

import urllib.request as req
import gzip, os, os.path

# Local directory that receives the archives, and the remote base URL they
# are fetched from.
savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist/"

# Gzip-compressed image and label archives (train-* and t10k-* sets).
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"
]

# exist_ok avoids the check-then-create race and also handles nested paths.
os.makedirs(savepath, exist_ok=True)
for f in files:
    url = baseurl + f
    loc = savepath + "/" + f
    # Skip archives that are already present so reruns do not re-download.
    if os.path.exists(loc):
        continue
    print("download: ", url)
    # Download to a temporary name and rename only on success; otherwise an
    # interrupted transfer would leave a truncated file at `loc` that the
    # existence check above would treat as complete on the next run.
    tmp = loc + ".part"
    req.urlretrieve(url, tmp)
    os.replace(tmp, loc)

# Decompress every downloaded archive into a file of the same name without
# the ".gz" part, stored in the same directory.
for archive in files:
    src_path = "/".join([savepath, archive])
    dst_path = "/".join([savepath, archive.replace(".gz", "")])
    print("gzip:", archive)
    # Read the whole decompressed payload first; the output file is only
    # opened once the read has succeeded.
    with gzip.open(src_path, "rb") as src:
        payload = src.read()
        with open(dst_path, "wb") as dst:
            dst.write(payload)
print("complete")

  • 데이터베이스가 바이너리 형식으로 되어 있어 그대로는 분석하기 어렵습니다.
  • 따라서 CSV로 변환해 줍니다.


SVM

import random

def calc_bmi(h, w):
    """Compute a BMI value from height and weight and return its label.

    The index is ``w / (h / 100) ** 2``; ``h`` is divided by 100 before
    squaring (presumably centimetres and kilograms -- confirm with callers).

    Args:
        h: Height; must be non-zero.
        w: Weight.

    Returns:
        "thin" when the index is below 18.5, "normal" when it is below 25,
        otherwise "fat".

    Raises:
        ZeroDivisionError: If ``h`` is 0.
    """
    scaled_height = h / 100
    index = w / scaled_height ** 2
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, name in ((18.5, "thin"), (25, "normal")):
        if index < upper_bound:
            return name
    return "fat"

# Generate random height/weight samples, label each one with calc_bmi, and
# save them to bmi.csv with a header row.
cnt = {"thin": 0, "normal": 0, "fat": 0}  # number of rows written per label

# The with-block guarantees the file is closed even if calc_bmi or a write
# raises part-way through the loop.
with open("bmi.csv", "w", encoding="utf-8") as fp:
    fp.write("height,weight,label\n")
    for _ in range(20000):
        h = random.randint(120, 200)
        w = random.randint(35, 80)
        label = calc_bmi(h, w)
        cnt[label] += 1
        fp.write("{0},{1},{2}\n".format(h, w, label))
print("ok", cnt)


from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

def _min_max_scale(column):
    """Rescale a numeric column with (x - min) / (max - min)."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Load the height/weight samples.
tbl = pd.read_csv("bmi.csv")

# Separate the label column and min-max normalise each feature column.
label = tbl["label"]
w = _min_max_scale(tbl["weight"])
h = _min_max_scale(tbl["height"])
# Feature table with the columns in (weight, height) order.
wh = pd.concat([w, h], axis="columns")


Leave a comment